--- /dev/null
+3.8.6-1
+e8ef53c4fa26582ac37b5e0101b7451a70263f6c
--- /dev/null
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
--- /dev/null
+# Boolean-inversion helper maps: $(not-$(VAR)) flips y<->n, and
+# $(m-not-$(VAR)) yields "m" (build as module) when VAR is "n".
+not-y := n
+not-n := y
+m-not-y := n
+m-not-n := m
+
+# Default CONFIG_X86_MICPCI to "n" (host-side build) when the kernel
+# configuration does not define it.
+ifeq ($(CONFIG_X86_MICPCI),)
+CONFIG_X86_MICPCI := n
+endif
+# A host-side build (CONFIG_X86_MICPCI=n) must say which card it targets.
+ifeq ($(CONFIG_X86_MICPCI)$(MIC_CARD_ARCH),n)
+$(error building for host, but $$(MIC_CARD_ARCH) is unset)
+endif
+# When set, MIC_CARD_ARCH must be exactly "l1om" or "k1om".
+ifneq ($(MIC_CARD_ARCH),$(firstword $(filter l1om k1om,$(MIC_CARD_ARCH))))
+$(error $$(MIC_CARD_ARCH) must be l1om or k1om)
+endif
+
+# Force optimization to -O2 in case the kernel was configured to use
+# -Os. The main reason is pretty dumb -- -Os has a warning -O2 doesn't,
+# and we compile with -Werror internally. Another reason is that -O2 is
+# what we're used to in terms of validation and performance analysis. We
+# should probably get rid of this, though.
+subdir-ccflags-y += -O2
+
+# Makes it easy to inject "-Werror" from the environment
+subdir-ccflags-y += $(KERNWARNFLAGS)
+
+# Bake some information about who built the module(s), and what version
+# of the source code they started with. Possibly useful during debug.
+subdir-ccflags-y += -DBUILD_NUMBER=\"'$(MPSS_BUILDNO)'\"
+subdir-ccflags-y += -DBUILD_BYWHOM=\"'$(MPSS_BUILTBY)'\"
+subdir-ccflags-y += -DBUILD_ONDATE=\"'$(MPSS_BUILTON)'\"
+subdir-ccflags-y += -DBUILD_SCMVER=\"'$(MPSS_COMMIT)'\"
+subdir-ccflags-y += -DBUILD_VERSION=\"'$(or $(MPSS_VERSION),0.0) ($(MPSS_BUILTBY))'\"
+
+# Code common with the host mustn't use CONFIG_M[LK]1OM directly.
+# But of course it does anyway. Arrgh.
+subdir-ccflags-$(CONFIG_ML1OM) += -DMIC_IS_L1OM
+subdir-ccflags-$(CONFIG_MK1OM) += -DMIC_IS_K1OM
+# Host-side builds have no CONFIG_M[LK]1OM in the kernel config, so
+# derive the same defines from $(MIC_CARD_ARCH) instead.
+ifeq ($(MIC_CARD_ARCH),l1om)
+subdir-ccflags-y += -DMIC_IS_L1OM -DCONFIG_ML1OM
+endif
+ifeq ($(MIC_CARD_ARCH),k1om)
+subdir-ccflags-y += -DMIC_IS_K1OM -DCONFIG_MK1OM
+endif
+
+# a shorthand for "runs on the card"?
+subdir-ccflags-$(CONFIG_X86_MICPCI) += -D_MIC_SCIF_
+
+# "runs on the host" -- uses the not-y/not-n inversion maps defined at
+# the top of this file to turn CONFIG_X86_MICPCI=n into a "y" suffix.
+subdir-ccflags-$(not-$(CONFIG_X86_MICPCI)) += -DHOST -DUSE_VCONSOLE
+
+# always set? what's this thing's purpose?
+subdir-ccflags-y += -D__LINUX_GPL__ -D_MODULE_SCIF_
+
+subdir-ccflags-y += -I$(M)/include
+
+# Card-side build: each subsystem builds in its own subdirectory.
+obj-$(CONFIG_X86_MICPCI) += dma/ micscif/ pm_scif/ ras/
+obj-$(CONFIG_X86_MICPCI) += vcons/ vnet/ mpssboot/ ramoops/ virtio/
+
+# Host-side build: everything is linked into a single mic.ko
+# (m-not-n expands to "m" when CONFIG_X86_MICPCI is "n").
+obj-$(m-not-$(CONFIG_X86_MICPCI)) += mic.o
+
+# Object list for the host-side monolithic mic.ko: DMA library, host
+# control plane (host/), SCIF (micscif/), and virtual ethernet (vnet/).
+mic-objs :=
+mic-objs += dma/mic_dma_lib.o
+mic-objs += dma/mic_dma_md.o
+mic-objs += host/acptboot.o
+mic-objs += host/ioctl.o
+mic-objs += host/linpm.o
+mic-objs += host/linpsmi.o
+mic-objs += host/linscif_host.o
+mic-objs += host/linsysfs.o
+mic-objs += host/linux.o
+mic-objs += host/linvcons.o
+mic-objs += host/linvnet.o
+mic-objs += host/micpsmi.o
+mic-objs += host/micscif_pm.o
+mic-objs += host/pm_ioctl.o
+mic-objs += host/pm_pcstate.o
+mic-objs += host/tools_support.o
+mic-objs += host/uos_download.o
+mic-objs += host/vhost/mic_vhost.o
+mic-objs += host/vhost/mic_blk.o
+mic-objs += host/vmcore.o
+mic-objs += micscif/micscif_api.o
+mic-objs += micscif/micscif_debug.o
+mic-objs += micscif/micscif_fd.o
+mic-objs += micscif/micscif_intr.o
+mic-objs += micscif/micscif_nm.o
+mic-objs += micscif/micscif_nodeqp.o
+mic-objs += micscif/micscif_ports.o
+mic-objs += micscif/micscif_rb.o
+mic-objs += micscif/micscif_rma_dma.o
+mic-objs += micscif/micscif_rma_list.o
+mic-objs += micscif/micscif_rma.o
+mic-objs += micscif/micscif_select.o
+mic-objs += micscif/micscif_smpt.o
+mic-objs += micscif/micscif_sysfs.o
+mic-objs += micscif/micscif_va_gen.o
+mic-objs += micscif/micscif_va_node.o
+mic-objs += vnet/micveth_dma.o
+mic-objs += vnet/micveth_param.o
+
+# version-le echoes "t" when the whitespace-separated version list in
+# $(1) is already in ascending order (each version <= the next).
+# "sort -c" only checks ordering; its output is discarded.
+version-le = $(shell printf '%s\n' $(1) | sort -t. -k 1,1n -k 2,2n -k 3,3n -k 4,4n -c >/dev/null 2>&1 && echo t)
+# Require 2.6.23 <= $(KERNELRELEASE).
+ifeq ($(call version-le, 2.6.23 $(KERNELRELEASE)),t)
+# NOTE(review): $(mic-cflags) is not defined anywhere in this file, so
+# this expands to nothing here -- verify whether "mic-cflags" was meant
+# to be populated, or whether subdir-ccflags-y already covers it.
+ccflags-y += $(mic-cflags)
+else
+$(error building against kernels <= 2.6.23 is broken)
+endif
--- /dev/null
+# Copyright 2010-2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License, version 2,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Disclaimer: The codes contained in these modules may be specific to
+# the Intel Software Development Platform codenamed Knights Ferry,
+# and the Intel product codenamed Knights Corner, and are not backward
+# compatible with other Intel products. Additionally, Intel will NOT
+# support the codes or instruction set in future products.
+#
+# Intel offers no warranty of any kind regarding the code. This code is
+# licensed on an "AS IS" basis and Intel is not obligated to provide
+# any support, assistance, installation, training, or other services
+# of any kind. Intel is also not obligated to provide any updates,
+# enhancements or extensions. Intel specifically disclaims any warranty
+# of merchantability, non-infringement, fitness for any particular
+# purpose, and any other warranty.
+#
+# Further, Intel disclaims all liability of any kind, including but
+# not limited to liability for infringement of any proprietary rights,
+# relating to the use of the code, even if Intel is notified of the
+# possibility of such liability. Except as expressly stated in an Intel
+# license agreement provided with this code and agreed upon with Intel,
+# no license, express or implied, by estoppel or otherwise, to any
+# intellectual property rights is granted herein.
+
+# Build metadata comes from the .mpss-metadata file (line 1 = version,
+# line 2 = commit hash) unless already provided by the environment.
+MPSS_COMMIT ?= $(or $(shell sed -ne '2 p' .mpss-metadata 2>/dev/null), \
+	$(error .mpss-metadata file is missing or incorrect))
+MPSS_VERSION ?= $(or $(shell sed -ne '1 p' .mpss-metadata 2>/dev/null), \
+	$(error .mpss-metadata file is missing or incorrect))
+MPSS_BUILDNO ?= 0
+# Freeze the values with ":=" so the $(shell ...) defaults above run
+# once, then export them for the kernel sub-make (Kbuild) to consume.
+export MPSS_COMMIT := $(MPSS_COMMIT)
+export MPSS_VERSION := $(MPSS_VERSION)
+export MPSS_BUILDNO := $(MPSS_BUILDNO)
+export MPSS_BUILTBY := $(shell echo "`whoami`@`uname -n`")
+export MPSS_BUILTON := $(shell date +'%F %T %z')
+
+# Build against the running kernel by default; override KERNEL_VERSION
+# or KERNEL_SRC on the command line for cross-version builds.
+KERNEL_VERSION := $(shell uname -r)
+KERNEL_SRC = /lib/modules/$(KERNEL_VERSION)/build
+
+INSTALL = install
+INSTALL_d = $(INSTALL) -d
+INSTALL_x = $(INSTALL)
+INSTALL_f = $(INSTALL) -m644
+
+prefix = /usr/local
+sysconfdir = $(prefix)/etc
+includedir = $(prefix)/include
+
+kmodinstalldir = /lib/modules/$(KERNEL_VERSION)
+kmodincludedir = $(realpath $(KERNEL_SRC))/include/modules
+
+# If building the host's driver for a MIC co-processor card, which card
+# $(ARCH) it should support
+export MIC_CARD_ARCH
+
+.PHONY: all install modules
+.PHONY: modules_install conf_install dev_install kdev_install
+
+all: modules
+
+# NOTE(review): dev_install is declared .PHONY but is not a dependency
+# of "install" -- confirm whether that is intentional.
+install: modules_install conf_install kdev_install
+
+# Delegate module build and install to the kernel's own build system.
+modules modules_install: %:
+	$(MAKE) -C $(KERNEL_SRC) M=$(CURDIR) $* \
+		INSTALL_MOD_PATH=$(DESTDIR)
+
+# Install modprobe/sysconfig/udev configuration. The modprobe and
+# sysconfig pieces apply only to host-side builds (MIC_CARD_ARCH set).
+conf_install:
+ifneq ($(MIC_CARD_ARCH),)
+	$(INSTALL_d) $(DESTDIR)$(sysconfdir)/sysconfig/modules
+	$(INSTALL_x) mic.modules $(DESTDIR)$(sysconfdir)/sysconfig/modules
+	$(INSTALL_d) $(DESTDIR)$(sysconfdir)/modprobe.d
+	$(INSTALL_f) mic.conf $(DESTDIR)$(sysconfdir)/modprobe.d
+endif
+	$(INSTALL_d) $(DESTDIR)$(sysconfdir)/udev/rules.d
+	$(INSTALL_f) udev-mic.rules $(DESTDIR)$(sysconfdir)/udev/rules.d/50-udev-mic.rules
+
+# Install public headers for user-space consumers of the driver.
+dev_install:
+	$(INSTALL_d) $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) include/scif_ioctl.h $(DESTDIR)$(includedir)
+	$(INSTALL_f) include/mic/io_interface.h $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) include/mic/mic_pm.h $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) ras/micras_api.h $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) ras/micmca_api.h $(DESTDIR)$(includedir)/mic
+ifeq ($(MIC_CARD_ARCH),) # Card side
+	$(INSTALL_f) ras/micpm_api.h $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) ras/micras.h $(DESTDIR)$(includedir)/mic
+else # Host side
+	$(INSTALL_f) include/mic/micbaseaddressdefine.h $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) include/mic/micsboxdefine.h $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) include/mic/micdboxdefine.h $(DESTDIR)$(includedir)/mic
+	$(INSTALL_f) ras/micpm_api.h $(DESTDIR)$(includedir)/mic
+endif
+
+# Install the symbol table and header other kernel modules build against.
+kdev_install:
+	$(INSTALL_d) $(DESTDIR)$(kmodinstalldir)
+	$(INSTALL_f) Module.symvers $(DESTDIR)$(kmodinstalldir)/scif.symvers
+	$(INSTALL_d) $(DESTDIR)$(kmodincludedir)
+	$(INSTALL_f) include/scif.h $(DESTDIR)$(kmodincludedir)
--- /dev/null
+# Standalone Kbuild for building the DMA library as its own module
+# (dma_module.ko) rather than linked into mic.ko.
+ccflags-y += -DDMA_CHAN_MIC_OWNER=0
+
+obj-m := dma_module.o
+
+dma_module-objs := mic_dma_lib.o mic_dma_md.o mic_sbox_md.o
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include<linux/module.h>
+#include<linux/init.h>
+#include<linux/slab.h>
+#include<asm/io.h>
+#include<linux/mm.h>
+#include<linux/kernel.h>
+#include<linux/interrupt.h>
+#include<linux/proc_fs.h>
+#include<linux/bitops.h>
+#include<linux/version.h>
+#ifdef _MIC_SCIF_
+#include <asm/mic/mic_common.h>
+#ifdef CONFIG_PAGE_CACHE_DMA
+#include <linux/mic_dma/mic_dma_callback.h>
+#endif
+#endif
+
+#ifndef _MIC_SCIF_
+#include <mic/micscif.h>
+#include "mic_common.h"
+#endif
+
+#include <mic/mic_dma_lib.h>
+#include <mic/micscif_smpt.h>
+#include <mic/mic_dma_md.h>
+#include <mic/mic_dma_api.h>
+#include <mic/compl_buf_ring.h>
+#include <mic/micscif_smpt.h>
+#include <mic/micsboxdefine.h>
+
+MODULE_LICENSE("GPL");
+
+#ifdef MIC_IS_EMULATION
+/* Emulation is slow -- effectively disable the DMA timeouts. */
+#define DMA_TO (INT_MAX)
+#define DMA_FENCE_TIMEOUT_CNT (INT_MAX)
+#else
+/* DMA completion timeout: 5 seconds. */
+#define DMA_TO (5 * HZ)
+#define DMA_SLOWEST_BW (300) // 300Mbps
+// the maximum size for each descriptor entry is 2M
+/* Number of DMA_TO periods to wait on a fence before giving up,
+ * derived from worst-case bandwidth -- TODO confirm the unit math. */
+#define DMA_FENCE_TIMEOUT_CNT (2 * MIC_MAX_NUM_DESC_PER_RING /DMA_SLOWEST_BW/ (DMA_TO/HZ))
+#endif
+
+#ifdef _MIC_SCIF_
+#define MAX_DMA_XFER_SIZE MIC_MAX_DMA_XFER_SIZE
+#else
+/* Use 512K as the maximum descriptor transfer size for Host */
+#define MAX_DMA_XFER_SIZE (((1U) * 1024 * 1024) >> 1)
+#endif
+#ifndef KASSERT
+/* Log a message and BUG() when the condition is false.
+ * The condition is fully parenthesized: the original "!x" mis-expanded
+ * for arguments like "a == b" ((!a) == b due to precedence). */
+#define KASSERT(x, y, ...) \
+	do { \
+		if (!(x)) \
+			printk(y, ##__VA_ARGS__);\
+		BUG_ON(!(x)); \
+	} while(0)
+#endif
+/*
+ * Array of per device DMA contexts. The card only uses index 0. The host uses one
+ * context per card starting from 0.
+ */
+static struct mic_dma_ctx_t *mic_dma_context[MAX_BOARD_SUPPORTED + 1];
+/* One mutex per board slot -- presumably serializes per-board DMA
+ * device init/teardown; TODO confirm against the init path. */
+static struct mutex lock_dma_dev_init[MAX_BOARD_SUPPORTED + 1];
+
+/* Descriptor formats programmed into the h/w descriptor ring. */
+enum mic_desc_format_type {
+	NOP,
+	MEMCOPY,
+	STATUS,
+	GENERAL,
+	KEYNONCECNT,
+	KEY
+};
+/* procfs entry-name prefixes for the debug files created in
+ * mic_dma_proc_init() (board/device suffix appended at create time --
+ * TODO confirm). */
+char proc_dma_reg[]="mic_dma_registers_";
+char proc_dma_ring[]="mic_dma_ring_";
+
+/* Prefix for this library's log messages. */
+#define PR_PREFIX "DMA_LIB_MI:"
+/* The s/w descriptor ring mirrors the h/w ring size. */
+#define DMA_DESC_RING_SIZE MIC_MAX_NUM_DESC_PER_RING
+#define MAX_POLLING_BUFFERS DMA_DESC_RING_SIZE
+
+#define DMA_PROC
+/* procfs setup/teardown for the per-device DMA debug entries. */
+static void mic_dma_proc_init(struct mic_dma_ctx_t *dma_ctx);
+static void mic_dma_proc_uninit(struct mic_dma_ctx_t *dma_ctx);
+
+/*
+ * TODO: This is size of s/w interrupt ring.
+ * We need to figure out a value so that we don't run out of memory in
+ * interrupt ring and at the same time don't waste memory
+ */
+#define NUM_COMP_BUFS (((PAGE_SIZE/sizeof(struct dma_completion_cb*)) - 10) * 10)
+
+/* Completion ring for interrupt-driven transfers: parallel array of
+ * completion callbacks tracked alongside the generic ring state. */
+struct intr_compl_buf_ring {
+	struct dma_completion_cb **comp_cb_array;	/* one callback slot per ring entry */
+	struct compl_buf_ring ring;
+	int old_tail;	/* tail position recorded by the drain path -- TODO confirm */
+};
+
+struct mic_dma_ctx_t; /* Forward Declaration */
+
+/*
+ * Per-channel software state wrapped around the hardware channel
+ * (struct md_mic_dma_chan).
+ */
+struct dma_channel {
+	int ch_num;/*Duplicated in md_mic_dma_chan struct too*/
+	struct md_mic_dma_chan *chan;		/* underlying h/w channel handle */
+	atomic_t flags;				/* channel state bits -- TODO confirm bit meanings */
+	wait_queue_head_t intr_wq;		/* presumably woken from the interrupt path -- verify */
+	wait_queue_head_t access_wq;		/* presumably gates exclusive channel access -- verify */
+	union md_mic_dma_desc *desc_ring_bak;	/* original (unaligned) kzalloc pointer, kept for freeing */
+	union md_mic_dma_desc *desc_ring;	/* desc_ring_bak rounded up to a page boundary */
+	phys_addr_t desc_ring_phys;		/* phys (card) / DMA-mapped (host) address of desc_ring */
+	uint64_t next_write_index; /* next write index into desc ring */
+	struct intr_compl_buf_ring intr_ring;	/* interrupt-mode completion tracking */
+	struct compl_buf_ring poll_ring;	/* poll-mode completion tracking */
+	struct mic_dma_ctx_t *dma_ctx; /* Pointer to parent DMA context */
+};
+
+/* Per MIC device (per MIC board) DMA context */
+struct mic_dma_ctx_t {
+	struct dma_channel dma_channels[MAX_NUM_DMA_CHAN];
+	int last_allocated_dma_channel_num;	/* presumably a round-robin allocation hint -- verify */
+	struct mic_dma_device dma_dev;		/* h/w device handle (see ack_dma_interrupt) */
+	int device_num;				/* board index into mic_dma_context[]; card uses 0 */
+	atomic_t ref_count; /* Reference count */
+	atomic_t ch_num;
+};
+
+/* DMA Library Init/Uninit Routines */
+static int mic_dma_lib_init(uint8_t *mmio_va_base, struct mic_dma_ctx_t *dma_ctx);
+static void mic_dma_lib_uninit(struct mic_dma_ctx_t *dma_ctx);
+
+/*
+ * get_chan_num - accessor returning the channel index recorded in the
+ * channel's software state.
+ */
+int get_chan_num(struct dma_channel *ch)
+{
+	return ch->ch_num;
+}
+EXPORT_SYMBOL(get_chan_num);
+
+/* Reset the global per-board context table: no board has a DMA context yet. */
+void initdmaglobalvar(void)
+{
+ int i;
+
+ for (i = 0; i <= MAX_BOARD_SUPPORTED; i++)
+ mic_dma_context[i] = NULL;
+}
+
+/* Acknowledge/re-arm a channel interrupt by toggling its mask bit. */
+static void
+ack_dma_interrupt(struct dma_channel *ch)
+{
+ struct mic_dma_device *dev = &ch->dma_ctx->dma_dev;
+
+ md_mic_dma_chan_mask_intr(dev, ch->chan);
+ md_mic_dma_chan_unmask_intr(dev, ch->chan);
+}
+
+/* Returns true if the next write index is "within" bounds */
+static inline bool verify_next_write_index(struct dma_channel *ch)
+{
+ if (ch->next_write_index < DMA_DESC_RING_SIZE)
+ return true;
+
+ /* Out of bounds: log and let the caller fail the request. */
+ printk(KERN_ERR "%s %d OOB ch_num 0x%x next_write_index 0x%llx\n",
+ __func__, __LINE__,
+ ch->ch_num, ch->next_write_index);
+ return false;
+}
+
+/* TODO:
+ * See if we can use __get_free_pages or something similar
+ * get_free_pages expects a power of 2 number of pages
+ */
+/*
+ * Allocate and page-align the h/w descriptor ring for a channel.
+ * desc_ring_bak keeps the raw kzalloc pointer (needed by kfree at uninit);
+ * desc_ring holds the aligned ring. On the host the ring is additionally
+ * DMA-mapped so the device can reach it.
+ */
+static void
+alloc_dma_desc_ring_mem(struct dma_channel *ch, struct mic_dma_ctx_t *dma_ctx)
+{
+#ifndef _MIC_SCIF_
+ struct pci_dev *pdev;
+#endif
+ /* Is there any kernel allocator which provides the
+ * option to give the alignment??
+ */
+ ch->desc_ring = kzalloc(
+ (DMA_DESC_RING_SIZE * sizeof(*ch->desc_ring)) + PAGE_SIZE, GFP_KERNEL);
+ /* Fail hard on allocation failure, consistent with open_dma_device */
+ BUG_ON(!ch->desc_ring);
+ ch->desc_ring_bak = ch->desc_ring;
+ /* Over-allocated by PAGE_SIZE above so the ring can be page aligned */
+ ch->desc_ring = (union md_mic_dma_desc *)ALIGN(
+ (uint64_t)ch->desc_ring, PAGE_SIZE);
+#ifdef _MIC_SCIF_
+ ch->desc_ring_phys = virt_to_phys(ch->desc_ring);
+#else
+ micscif_pci_dev(dma_ctx->device_num, &pdev);
+ ch->desc_ring_phys = mic_map_single(dma_ctx->device_num - 1, pdev, (void *)ch->desc_ring,
+ (DMA_DESC_RING_SIZE * sizeof(*ch->desc_ring)) + PAGE_SIZE);
+ BUG_ON(pci_dma_mapping_error(pdev, ch->desc_ring_phys));
+#endif
+}
+
+/*
+ * Call completion cb functions:
+ * Take care of case where we allocated temp buf
+ */
+/*
+ * Interrupt-side worker: walk the interrupt completion ring from the last
+ * processed tail to the current h/w tail, invoking and clearing each
+ * registered callback, then advance the tail and wake waiters.
+ */
+static void
+mic_dma_lib_interrupt_handler(struct dma_channel *chan)
+{
+ int i = 0;
+ int ring_size = chan->intr_ring.ring.size;
+ struct dma_completion_cb **temp = chan->intr_ring.comp_cb_array;
+ struct dma_completion_cb *cb;
+ int new_tail, old_tail;
+
+ /* On KNC B0+ check the DSTAT writeback word for a h/w error bit */
+ if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC &&
+ mic_hw_stepping(chan->dma_ctx->device_num) >= KNC_B0_STEP) {
+ unsigned long error = *((uint32_t*)chan->chan->dstat_wb_loc);
+ if (unlikely(test_bit(31, &error)))
+ printk(KERN_ERR "DMA h/w error - %s %d, dstatwb=%lx\n",
+ __func__, __LINE__, error);
+ }
+ new_tail = read_tail(&chan->intr_ring.ring);
+ old_tail = chan->intr_ring.old_tail;
+
+ /* xchg clears each slot so a callback fires at most once */
+ for (; i < ring_size && old_tail != new_tail;
+ old_tail = incr_rb_index(old_tail, ring_size), i++) {
+ cb = (struct dma_completion_cb *)xchg(&temp[old_tail], NULL);
+ if (cb) {
+ cb->dma_completion_func(cb->cb_cookie);
+ }
+ }
+ chan->intr_ring.old_tail = new_tail;
+ update_tail(&chan->intr_ring.ring, new_tail);
+ wake_up(&chan->intr_wq);
+ /* A full-ring walk that still did not reach new_tail indicates a bug */
+ if (i == ring_size && old_tail != new_tail) {
+ printk(KERN_ERR PR_PREFIX "Something went wrong, old tail = %d, new tail = %d\n",
+ old_tail, new_tail);
+ }
+}
+
+#ifdef _MIC_SCIF_
+/*
+ * TODO;
+ * Maybe move the logic into slow interrupt handler
+ */
+/*
+ * Card-side IRQ handler: ack the channel interrupt, then process the
+ * interrupt completion ring. dev_id is the struct dma_channel registered
+ * via request_irq() in mic_dma_lib_init().
+ */
+static irqreturn_t
+dma_interrupt_handler(int irq, void *dev_id)
+{
+ struct dma_channel *chan = ((struct dma_channel*)dev_id);
+
+ ack_dma_interrupt(chan);
+ mic_dma_lib_interrupt_handler(chan);
+
+ return IRQ_HANDLED;
+}
+#else
+
+/* Extract the 8 per-channel DMA interrupt bits from the SBOX SICR0 register */
+#define SBOX_SICR0_DMA(x) (((x) >> 8) & 0xff)
+
+/*
+ * TODO;
+ * Maybe move the logic into slow interrupt handler
+ */
+/*
+ * Host-side dispatcher: for each of the 8 DMA bits set in sboxSicr0reg,
+ * run the per-channel handler. Channels without a desc ring (not owned
+ * by the host) are skipped.
+ */
+void
+host_dma_interrupt_handler(mic_dma_handle_t dma_handle, uint32_t sboxSicr0reg)
+{
+ struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *) dma_handle;
+ uint32_t dma_chan_id;
+ struct dma_channel *ch;
+
+ for (dma_chan_id = 0; dma_chan_id < 8; dma_chan_id++) {
+ if (SBOX_SICR0_DMA(sboxSicr0reg) & (0x1 << dma_chan_id)) {
+ ch = &dma_ctx->dma_channels[dma_chan_id];
+ if (ch->desc_ring)
+ host_dma_lib_interrupt_handler(ch);
+ }
+ }
+}
+
+/* Host-side per-channel handler: ack the interrupt, then run completions. */
+void
+host_dma_lib_interrupt_handler(struct dma_channel *chan)
+{
+ ack_dma_interrupt(chan);
+ mic_dma_lib_interrupt_handler(chan);
+}
+#endif
+
+/*
+ * Machine-independent per-channel setup: sync the s/w write index with the
+ * h/w cached tail, and initialize the polling and interrupt status rings.
+ */
+static void
+mi_mic_dma_chan_setup(struct dma_channel *ch, struct mic_dma_ctx_t *dma_ctx)
+{
+ ch->next_write_index = ch->chan->cached_tail;
+
+ init_ring(&ch->poll_ring, MAX_POLLING_BUFFERS, dma_ctx->device_num);
+
+ ch->intr_ring.comp_cb_array =
+ kzalloc(sizeof(*ch->intr_ring.comp_cb_array) * NUM_COMP_BUFS, GFP_KERNEL);
+ /* Fail hard on allocation failure, consistent with open_dma_device */
+ BUG_ON(!ch->intr_ring.comp_cb_array);
+ init_ring(&ch->intr_ring.ring, NUM_COMP_BUFS, dma_ctx->device_num);
+ ch->intr_ring.old_tail = 0;
+}
+
+/* Tear down the s/w rings created by mi_mic_dma_chan_setup(). */
+static void
+mi_mic_dma_chan_destroy(struct dma_channel *ch, struct mic_dma_ctx_t *dma_ctx)
+{
+ uninit_ring(&ch->intr_ring.ring, dma_ctx->device_num);
+ kfree(ch->intr_ring.comp_cb_array);
+ uninit_ring(&ch->poll_ring, dma_ctx->device_num);
+}
+
+/*
+ * open_dma_device - open (lazily creating) the per-board DMA context.
+ * @device_num: board number, used to index mic_dma_context[]
+ * @mmio_va_base: DMA MMIO base VA (NULL on the card side)
+ * @dma_handle: out parameter; receives the opaque context handle
+ *
+ * The context is reference counted; every successful open must be
+ * balanced by close_dma_device().
+ * Returns 0 on success, -EINVAL for an out-of-range device number.
+ */
+int
+open_dma_device(int device_num, uint8_t *mmio_va_base, mic_dma_handle_t* dma_handle)
+{
+ int result = 0;
+
+ /* Reject out-of-range board numbers, negative values included */
+ if (device_num < 0 || device_num >= MAX_BOARD_SUPPORTED)
+ return -EINVAL;
+
+ mutex_lock(&lock_dma_dev_init[device_num]);
+ if (!mic_dma_context[device_num]) {
+ mic_dma_context[device_num] = kzalloc(sizeof(struct mic_dma_ctx_t), GFP_KERNEL);
+ BUG_ON(!mic_dma_context[device_num]);
+
+ mic_dma_context[device_num]->device_num = device_num;
+
+ result = mic_dma_lib_init(mmio_va_base, mic_dma_context[device_num]);
+ BUG_ON(result);
+ }
+
+ atomic_inc(&mic_dma_context[device_num]->ref_count);
+ *dma_handle = mic_dma_context[device_num];
+ mutex_unlock(&lock_dma_dev_init[device_num]);
+
+ return result;
+}
+EXPORT_SYMBOL(open_dma_device);
+
+/*
+ * close_dma_device - drop one reference to the per-board DMA context.
+ * @device_num: board number (validated against the supported range)
+ * @dma_handle: handle returned by open_dma_device; cleared on final close
+ *
+ * On the final reference the context is uninitialized and freed.
+ */
+void
+close_dma_device(int device_num, mic_dma_handle_t *dma_handle)
+{
+ struct mic_dma_ctx_t *dma_ctx;
+
+ /* Reject out-of-range board numbers, negative values included */
+ if (device_num < 0 || device_num >= MAX_BOARD_SUPPORTED)
+ return;
+
+ mutex_lock(&lock_dma_dev_init[device_num]);
+ dma_ctx = (struct mic_dma_ctx_t *) *dma_handle;
+ if (dma_ctx &&
+ atomic_read(&dma_ctx->ref_count) &&
+ atomic_dec_and_test(&dma_ctx->ref_count)) {
+ mic_dma_lib_uninit(dma_ctx);
+ mic_dma_context[dma_ctx->device_num] = NULL;
+ *dma_handle = NULL;
+ kfree(dma_ctx);
+ }
+ mutex_unlock(&lock_dma_dev_init[device_num]);
+}
+EXPORT_SYMBOL(close_dma_device);
+
+/*
+ * Lazily allocate (and on the host, DMA-map) the DSTAT writeback word for
+ * a channel, then program its address into the h/w via the MD layer.
+ * Idempotent: an already-set dstat_wb_phys is reused.
+ */
+void mi_mic_dma_chan_set_dstat_wb(struct mic_dma_ctx_t *dma_ctx,
+ struct md_mic_dma_chan *chan)
+{
+#ifndef _MIC_SCIF_
+ struct pci_dev *pdev;
+#endif
+ if (!chan->dstat_wb_phys) {
+ chan->dstat_wb_loc = kzalloc(sizeof(uint32_t), GFP_KERNEL);
+ /* Fail hard on allocation failure, consistent with open_dma_device */
+ BUG_ON(!chan->dstat_wb_loc);
+
+#ifdef _MIC_SCIF_
+ chan->dstat_wb_phys = virt_to_phys(chan->dstat_wb_loc);
+#else
+ micscif_pci_dev(dma_ctx->device_num, &pdev);
+ chan->dstat_wb_phys = mic_map_single(dma_ctx->device_num - 1, pdev, chan->dstat_wb_loc,
+ sizeof(uint32_t));
+ BUG_ON(pci_dma_mapping_error(pdev, chan->dstat_wb_phys));
+#endif
+ }
+ md_mic_dma_chan_set_dstat_wb(&dma_ctx->dma_dev, chan);
+}
+
+/*
+ * Program a channel's descriptor ring base/size into the h/w, enabling
+ * DSTAT writeback and error-mask setup on KNC B0+ first.
+ */
+void
+md_mic_dma_chan_setup(struct mic_dma_ctx_t *dma_ctx, struct dma_channel *ch)
+{
+ /*
+ * NOTE(review): the interrupt is unmasked both here and at the end of
+ * this function, while the comment below speaks of disabling the
+ * channel first — confirm whether this first call should mask instead.
+ */
+ md_mic_dma_chan_unmask_intr(&dma_ctx->dma_dev, ch->chan);
+
+ /*
+ * Disable the channel, update desc ring base and size, write new head
+ * and then enable the channel.
+ */
+ if (mic_hw_family(ch->dma_ctx->device_num) == FAMILY_KNC &&
+ mic_hw_stepping(ch->dma_ctx->device_num) >= KNC_B0_STEP) {
+ mi_mic_dma_chan_set_dstat_wb(dma_ctx, ch->chan);
+ md_mic_dma_chan_set_dcherr_msk(&dma_ctx->dma_dev, ch->chan, 0);
+ }
+ md_mic_dma_chan_set_desc_ring(&dma_ctx->dma_dev, ch->chan,
+ ch->desc_ring_phys,
+ DMA_DESC_RING_SIZE);
+
+ /* Ensure the ring programming is visible before re-enabling interrupts */
+ wmb();
+
+ md_mic_dma_chan_unmask_intr(&dma_ctx->dma_dev, ch->chan);
+}
+
+/*
+ * mic_dma_lib_init - one-time initialization of all DMA channels for a board.
+ * @mmio_va_base: DMA MMIO base VA; NULL means we are running on the card
+ * @dma_ctx: per-board DMA context to populate
+ *
+ * Requests every channel from the MD layer; for channels owned by the side
+ * we are running on, allocates the descriptor ring, registers the IRQ
+ * handler (card only), programs the h/w and marks the channel available.
+ * Always returns 0 (allocation failures BUG out in the helpers).
+ */
+int
+mic_dma_lib_init(uint8_t *mmio_va_base, struct mic_dma_ctx_t *dma_ctx)
+{
+ int i;
+#ifdef _MIC_SCIF_
+ int ret_value;
+#endif
+ struct dma_channel *ch;
+ enum md_mic_dma_chan_owner owner, currentOwner;
+
+ //pr_debug(PR_PREFIX "Initialized the dma mmio va=%p\n", mmio_va_base);
+ // Using this to check where the DMA lib is at for now.
+ currentOwner = mmio_va_base == 0 ? MIC_DMA_CHAN_MIC_OWNED : MIC_DMA_CHAN_HOST_OWNED;
+
+ // TODO: multi-card support
+ md_mic_dma_init(&dma_ctx->dma_dev, mmio_va_base);
+
+ for (i = 0 ; i < MAX_NUM_DMA_CHAN; i++) {
+ ch = &dma_ctx->dma_channels[i];
+
+ /* Initialize pointer to parent */
+ ch->dma_ctx = dma_ctx;
+
+ /* Channels after __LAST_HOST_CHAN_NUM belong to the card */
+ owner = i > __LAST_HOST_CHAN_NUM ? MIC_DMA_CHAN_MIC_OWNED
+ : MIC_DMA_CHAN_HOST_OWNED;
+
+ // This has to be done from card side
+ ch->chan = md_mic_dma_request_chan(&dma_ctx->dma_dev, owner);
+ KASSERT((ch->chan != NULL), "dummy\n");
+ ch->ch_num = ch->chan->ch_num;
+
+#ifdef _MIC_SCIF_
+ /*
+ * Host driver would have executed by now and thus setup the
+ * desc. ring
+ */
+ if (ch->chan->owner == MIC_DMA_CHAN_HOST_OWNED)
+ md_mic_dma_enable_chan(&dma_ctx->dma_dev, i, true);
+#endif
+
+ atomic_set(&(ch->flags), CHAN_INUSE); // Mark as used by default
+ /* Only fully initialize channels owned by the side we run on */
+ if (currentOwner == owner) {
+ alloc_dma_desc_ring_mem(ch, dma_ctx);
+
+#ifdef _MIC_SCIF_ // DMA now shares the IRQ handler with other system interrupts
+ ret_value = request_irq(i, dma_interrupt_handler, IRQF_DISABLED,
+ "dma channel", ch);
+ ret_value = ret_value;
+ //pr_debug(PR_PREFIX "Interrupt handler ret value for chan %d = %d\n", i, ret_value);
+#endif
+ md_mic_dma_chan_setup(dma_ctx, ch);
+
+ mi_mic_dma_chan_setup(ch, dma_ctx);
+
+ init_waitqueue_head(&ch->intr_wq);
+ init_waitqueue_head(&ch->access_wq);
+ // Only mark owned channel to be available
+ atomic_set(&(ch->flags), CHAN_AVAILABLE);
+ md_mic_dma_print_debug(&dma_ctx->dma_dev, ch->chan);
+ } else {
+ /* Foreign-owned channel: no local desc ring */
+ ch->desc_ring = NULL;
+ }
+ }
+
+ /* Initialize last_allocated_dma_channel */
+ dma_ctx->last_allocated_dma_channel_num = -1;
+ //pr_debug(PR_PREFIX "Initialized the dma channels\n");
+ mic_dma_proc_init(dma_ctx);
+ return 0;
+}
+
+/*
+ * mic_dma_lib_uninit - drain and tear down all locally-owned DMA channels,
+ * release rings, IRQs, mappings and the DSTAT writeback word, then
+ * uninitialize the MD layer.
+ */
+void
+mic_dma_lib_uninit(struct mic_dma_ctx_t *dma_ctx)
+{
+ int i;
+ struct dma_channel *ch;
+#ifndef _MIC_SCIF_
+ struct pci_dev *pdev;
+#endif
+
+ mic_dma_proc_uninit(dma_ctx);
+ for (i = 0 ; i < MAX_NUM_DMA_CHAN; i++) {
+ ch = &dma_ctx->dma_channels[i];
+ /* desc_ring is NULL for channels the other side owns */
+ if (!ch->desc_ring)
+ continue;
+ drain_dma_intr(ch);
+ /* Request the channel but don't free it. Errors are okay */
+ request_dma_channel(ch);
+#ifdef _MIC_SCIF_ // DMA now shares the IRQ handler with other system interrupts
+ free_irq(i, ch);
+#endif
+ mi_mic_dma_chan_destroy(ch, dma_ctx);
+#ifndef _MIC_SCIF_
+ micscif_pci_dev(dma_ctx->device_num, &pdev);
+ mic_unmap_single(dma_ctx->device_num - 1, pdev, ch->desc_ring_phys,
+ (DMA_DESC_RING_SIZE * sizeof(*ch->desc_ring)) + PAGE_SIZE);
+#endif
+
+ /* Free via the unaligned pointer saved at allocation time */
+ kfree(ch->desc_ring_bak);
+ ch->desc_ring_bak = NULL;
+ ch->desc_ring = NULL;
+ /* DSTAT writeback word only exists on KNC B0+ (see set_dstat_wb) */
+ if (mic_hw_family(ch->dma_ctx->device_num) == FAMILY_KNC &&
+ mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP) {
+#ifndef _MIC_SCIF_
+ mic_unmap_single(dma_ctx->device_num - 1, pdev, ch->chan->dstat_wb_phys,
+ sizeof(uint32_t));
+#endif
+ kfree(ch->chan->dstat_wb_loc);
+ ch->chan->dstat_wb_loc = NULL;
+ ch->chan->dstat_wb_phys = 0;
+ }
+ md_mic_dma_free_chan(&dma_ctx->dma_dev, ch->chan);
+ }
+#ifndef MIC_IS_EMULATION
+ /* Ensure that all waiters for DMA channels time out */
+ msleep(DMA_TO/HZ * 1000);
+#endif
+ md_mic_dma_uninit(&dma_ctx->dma_dev);
+ //pr_debug(PR_PREFIX "Uninitialized the dma channels\n");
+}
+
+/*
+ * reserve_dma_channel - reserve a given dma channel for exclusive use
+ *
+ * @dma_handle - handle to DMA device returned by open_dma_device
+ * @chan_num - Channel number to be reserved
+ * @chan - set to point to the dma channel reserved by the call
+ *
+ * Returns < 1 on error (errorno)
+ * Returns 0 on success
+ *
+ * NOTES: Should this function sleep waiting for the lock?
+ * TODO:
+ * Maybe there should be a blocking and non-blocking versions of this function
+ */
+int
+reserve_dma_channel(mic_dma_handle_t dma_handle, int chan_num, struct dma_channel **chan)
+{
+ struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *) dma_handle;
+
+ /* Validate the channel number before indexing the channel array */
+ if (chan_num < 0 || chan_num >= MAX_NUM_DMA_CHAN)
+ return -EINVAL;
+
+ /*
+ * Do we need to do acquire the lock for statically allocated channels?
+ * I am assuming we dont have to lock
+ */
+ if (CHAN_AVAILABLE == atomic_cmpxchg(&(dma_ctx->dma_channels[chan_num].flags),
+ CHAN_AVAILABLE, CHAN_INUSE)) {
+ *chan = &dma_ctx->dma_channels[chan_num];
+ return 0;
+ }
+ return -1;
+}
+EXPORT_SYMBOL(reserve_dma_channel);
+
+/*
+ * allocate_dma_channel - dynamically grab one free DMA channel, searching
+ * round-robin from the channel after the last one handed out.
+ *
+ * @dma_handle - handle to DMA device returned by open_dma_device
+ * @chan - out parameter; set to the channel that was locked down
+ *
+ * Returns 0 on success, -ENODEV for a NULL handle, -1 when every channel
+ * is busy.
+ *
+ * NOTE: the caller owns the channel on success and MUST NOT sleep before
+ * calling free_dma_channel; not usable from interrupt context.
+ */
+int
+allocate_dma_channel(mic_dma_handle_t dma_handle, struct dma_channel **chan)
+{
+ struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *) dma_handle;
+ int attempt, cursor;
+
+ if (!dma_ctx)
+ return -ENODEV;
+
+ cursor = dma_ctx->last_allocated_dma_channel_num + 1;
+
+ for (attempt = 0; attempt < MAX_NUM_DMA_CHAN; attempt++, cursor++) {
+ int slot = cursor % MAX_NUM_DMA_CHAN;
+
+ if (CHAN_AVAILABLE == atomic_cmpxchg(&(dma_ctx->dma_channels[slot].flags),
+ CHAN_AVAILABLE, CHAN_INUSE)) {
+ *chan = &(dma_ctx->dma_channels[slot]);
+ dma_ctx->last_allocated_dma_channel_num = slot;
+ return 0;
+ }
+ }
+ return -1;
+}
+EXPORT_SYMBOL(allocate_dma_channel);
+
+/*
+ * request_dma_channel - Request a specific DMA channel, sleeping until it
+ * becomes available or the wait times out.
+ *
+ * @chan - the DMA channel to acquire
+ *
+ * Returns: 0 on success, -ERESTARTSYS if the wait was interrupted,
+ * -EBUSY if the channel did not become available within DMA_TO.
+ *
+ * NOTE: This function must call free_dma_channel before returning to
+ * user-space.
+ */
+int request_dma_channel(struct dma_channel *chan)
+{
+ int rc;
+
+ /* Sleep until the CHAN_AVAILABLE -> CHAN_INUSE transition succeeds. */
+ rc = wait_event_interruptible_timeout(chan->access_wq,
+ CHAN_AVAILABLE == atomic_cmpxchg(&chan->flags,
+ CHAN_AVAILABLE, CHAN_INUSE), DMA_TO);
+ if (rc > 0)
+ return 0;
+ if (!rc) {
+ /* Timed out without acquiring the channel. */
+ printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num);
+ return -EBUSY;
+ }
+ /* Interrupted: propagate -ERESTARTSYS. */
+ return rc;
+}
+EXPORT_SYMBOL(request_dma_channel);
+
+/*
+ * free_dma_channel - release a channel previously obtained via
+ * allocate_dma_channel/reserve_dma_channel/request_dma_channel.
+ *
+ * @chan - pointer to the dma_channel struct that was allocated
+ *
+ * Returns 0 on success.
+ *
+ * NOTE: Must be called after all do_dma calls have been submitted, though
+ * the DMAs themselves need not have completed (as long as no completion
+ * callback touches the dma_channel struct). Only the thread that acquired
+ * a dynamically allocated channel may free it; freeing a channel that is
+ * not currently in use is a bug (BUG_ON below).
+ */
+int
+free_dma_channel(struct dma_channel *chan)
+{
+ int prev_state;
+
+ /* The channel must be in use; anything else indicates a caller bug. */
+ prev_state = atomic_cmpxchg(&chan->flags, CHAN_INUSE, CHAN_AVAILABLE);
+ BUG_ON(CHAN_INUSE != prev_state);
+ wake_up(&chan->access_wq);
+ return 0;
+}
+EXPORT_SYMBOL(free_dma_channel);
+
+/* Read the channel's current h/w tail pointer via the MD layer. */
+static __always_inline uint32_t
+get_dma_tail_pointer(struct dma_channel *chan)
+{
+ return md_mic_dma_chan_read_tail(&chan->dma_ctx->dma_dev, chan->chan);
+}
+/*
+ * Split a copy of 'len' bytes into MAX_DMA_XFER_SIZE-sized memcpy
+ * descriptors and write them into the channel's ring.
+ * Returns 0 on success, -ENOMEM if no ring space frees up within DMA_TO.
+ */
+static int
+program_memcpy_descriptors(struct dma_channel *chan, uint64_t src, uint64_t dst, size_t len)
+{
+ size_t current_transfer_len;
+ bool is_astep = false;
+ unsigned long ts = jiffies;
+
+ /* KNC A-step and all non-KNC parts take the A-step code path in MD */
+ if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) {
+ if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP)
+ is_astep = true;
+ } else {
+ is_astep = true;
+ }
+ do {
+ current_transfer_len = (len > MAX_DMA_XFER_SIZE) ?
+ MAX_DMA_XFER_SIZE : len;
+
+ /* Busy-wait (bounded by DMA_TO) for one free ring slot */
+ ts = jiffies;
+ while (!md_avail_desc_ring_space(&chan->dma_ctx->dma_dev, is_astep, chan->chan,
+ (uint32_t)chan->next_write_index, 1)) {
+ if (time_after(jiffies,ts + DMA_TO)) {
+ printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num);
+ return -ENOMEM;
+ }
+ }
+
+ //pr_debug("src_phys=0x%llx, dst_phys=0x%llx, size=0x%zx\n", src_phys_addr, dst_phys_addr, current_transfer_len);
+ md_mic_dma_memcpy_desc(&chan->desc_ring[chan->next_write_index],
+ src, dst, current_transfer_len);
+ chan->next_write_index = incr_rb_index((int)chan->next_write_index,
+ chan->chan->num_desc_in_ring);
+ len -= current_transfer_len;
+ dst = dst + current_transfer_len;
+ src = src + current_transfer_len;
+ } while(len > 0);
+
+ return 0;
+}
+
+/*
+ * do_dma - main dma function: perform a dma memcpy, len bytes from src to dst
+ *
+ * @chan - DMA channel to use for the transfer. The channel can be allocated
+ * dynamically by calling allocate_dma_chan, or statically by
+ * reserve_dma_chan. Using a channel not allocated in this way will
+ * result in undefined behavior.
+ * @flags - ATOMIC, called from an interrupt context (no blocking)
+ * @src - src physical address
+ * @dst - dst physical address
+ * @len - Length of the dma
+ * @comp_cb - When the DMA is complete, the struct's function will be called. NOTE!
+ * comp_cb(cb_cookie) is called from an interrupt context, so the
+ * function must not sleep or block.
+ *
+ * TODO: Figure out proper value instead of -2
+ * Return < 0 on error
+ * Return = -2 copy was done successfully, no need to wait
+ * Return >= 0: DMA has been queued. Return value can be polled on for completion
+ * if DO_DMA_POLLING was sent in flags
+ * (poll cookie). An example (simplified w/ no error handling).
+ * int cookie = do_dma(...);
+ * while (poll_dma_completion(cookie) == 0);
+ * printf("DMA now complete\n");
+ */
+int
+do_dma(struct dma_channel *chan, int flags, uint64_t src,
+ uint64_t dst, size_t len, struct dma_completion_cb *comp_cb)
+{
+ /*
+ * TODO:
+ * Do we need to assert the ownership of channel??
+ */
+ int poll_ring_index = -1;
+ int intr_ring_index = -1;
+ uint32_t num_status_desc = 0;
+ bool is_astep = false;
+ unsigned long ts = jiffies;
+
+ might_sleep();
+ /* Interrupt mode requires a completion callback to invoke */
+ if (flags & DO_DMA_INTR && !comp_cb)
+ return -EINVAL;
+
+ if (!verify_next_write_index(chan))
+ return -ENODEV;
+
+ //pr_debug(PR_PREFIX "Current transfer src = 0x%llx,dst = 0x%llx, len = 0x%zx\n", src, dst, len);
+ /* Reserve a slot in the interrupt completion ring and register the cb */
+ if (flags & DO_DMA_INTR) {
+ int err;
+ err = wait_event_interruptible_timeout(chan->intr_wq,
+ (-1 != (intr_ring_index = allocate_buffer(&chan->intr_ring.ring))),
+ DMA_TO);
+ if (!err) {
+ printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num);
+ err = -ENOMEM;
+ }
+ if (err > 0)
+ err = 0;
+ if (!err) {
+ chan->intr_ring.comp_cb_array[intr_ring_index] = comp_cb;
+ num_status_desc++;
+#ifdef CONFIG_MK1OM
+ /* K1OM programs an extra non-interrupt status descriptor */
+ num_status_desc++;
+#endif
+ } else {
+ return err;
+ }
+ //pr_debug(PR_PREFIX "INTR intr_ring_index=%d, chan_num=%lx\n", intr_ring_index, (chan - dma_channels));
+ }
+
+ /* Reserve a slot in the polling status ring */
+ if (flags & DO_DMA_POLLING) {
+ poll_ring_index = allocate_buffer(&chan->poll_ring);
+ if (-1 == poll_ring_index)
+ return -ENOMEM;
+ num_status_desc++;
+ //pr_debug(PR_PREFIX "polling poll_ring_index=%d\n", poll_ring_index);
+ }
+ /*
+ * NOTE(review): the -ENOMEM returns below do not release the intr/poll
+ * ring slots reserved above — confirm whether that leak is acceptable.
+ */
+ if (len && -ENOMEM == program_memcpy_descriptors(chan, src, dst, len)) {
+ //pr_debug(PR_PREFIX "ERROR: do_dma: No available space from program_memcpy_descriptors\n");
+ return -ENOMEM;
+ }
+
+ /* KNC A-step and all non-KNC parts take the A-step code path in MD */
+ if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) {
+ if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP)
+ is_astep = true;
+ } else {
+ is_astep = true;
+ }
+
+ ts = jiffies;
+
+ /* Busy-wait (bounded by DMA_TO) for room for the status descriptor(s) */
+ while (num_status_desc && num_status_desc > md_avail_desc_ring_space(&chan->dma_ctx->dma_dev,
+ is_astep, chan->chan, (uint32_t)chan->next_write_index, num_status_desc)) {
+ if (time_after(jiffies,ts + DMA_TO)) {
+ printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num);
+ return -ENOMEM;
+ }
+ //pr_debug(PR_PREFIX "ERROR: do_dma: No available space from md_avail_desc_ring_space\n");
+ }
+
+ /* Status descriptor that marks the poll-ring entry done (no interrupt) */
+ if (flags & DO_DMA_POLLING) {
+ incr_head(&chan->poll_ring);
+ md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index],
+ poll_ring_index,
+ chan->poll_ring.tail_phys,
+ false);
+ chan->next_write_index = incr_rb_index((int)chan->next_write_index,
+ chan->chan->num_desc_in_ring);
+ }
+
+ /* Status descriptor(s) that update the intr ring and raise the interrupt */
+ if (flags & DO_DMA_INTR) {
+ incr_head(&chan->intr_ring.ring);
+#ifdef CONFIG_MK1OM
+ md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index],
+ intr_ring_index,
+ chan->intr_ring.ring.tail_phys,
+ false);
+ chan->next_write_index = incr_rb_index((int)chan->next_write_index,
+ chan->chan->num_desc_in_ring);
+#endif
+ md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index],
+ intr_ring_index,
+ chan->intr_ring.ring.tail_phys,
+ true);
+ chan->next_write_index = incr_rb_index((int)chan->next_write_index,
+ chan->chan->num_desc_in_ring);
+ }
+
+ /*
+ * TODO:
+ * Maybe it is better if we update the head pointer for every descriptor??
+ */
+ md_mic_dma_chan_write_head(&chan->dma_ctx->dma_dev, chan->chan, (uint32_t)chan->next_write_index);
+ //pr_debug(PR_PREFIX "in HW chan->next_write_index=%lld\n", chan->next_write_index);
+
+ if (DO_DMA_POLLING & flags)
+ return poll_ring_index;
+ return 0;
+}
+EXPORT_SYMBOL(do_dma);
+
+/*
+ * poll_dma_completion - check whether a previously queued DMA is done
+ *
+ * @poll_cookie - value returned from do_dma (DO_DMA_POLLING)
+ * @chan - the channel the DMA was queued on
+ *
+ * Returns 0 while the DMA is pending, 1 once it has completed,
+ * -EINVAL for a NULL channel.
+ *
+ * Mostly useful after a do_dma call with a NULL comp_cb, to let the
+ * caller busy-wait for completion.
+ */
+int
+poll_dma_completion(int poll_cookie, struct dma_channel *chan)
+{
+ struct compl_buf_ring *ring;
+
+ if (!chan)
+ return -EINVAL;
+
+ ring = &chan->poll_ring;
+ /*
+ * With interrupts the ISR refreshes the cached tail; when polling we
+ * must re-read the tail location ourselves before testing the entry.
+ */
+ ring->tail = read_tail(ring);
+ return is_entry_processed(ring, poll_cookie);
+}
+EXPORT_SYMBOL(poll_dma_completion);
+
+/*
+ * do_status_update: Update physical address location with the value provided.
+ * Ensures all previous DMA descriptors submitted on this DMA
+ * channel are executed.
+ * @chan - DMA channel to use for the transfer. The channel can be allocated
+ * dynamically by calling allocate_dma_channel, or statically by
+ * reserve_dma_channel. Using a channel not allocated in this way will
+ * result in undefined behavior.
+ * @phys - physical address
+ * @value - Value to be programmed
+ *
+ * Returns 0 on success, -ENODEV for a corrupt write index, -EBUSY if no
+ * ring space frees up within DMA_TO.
+ */
+int do_status_update(struct dma_channel *chan, uint64_t phys, uint64_t value)
+{
+ unsigned long ts = jiffies;
+ bool is_astep = false;
+
+ if (!verify_next_write_index(chan))
+ return -ENODEV;
+
+ /* KNC A-step and all non-KNC parts take the A-step code path in MD */
+ if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) {
+ if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP)
+ is_astep = true;
+ } else {
+ is_astep = true;
+ }
+ /*
+ * TODO:
+ * Do we need to assert the ownership of channel??
+ */
+ /* Busy-wait (bounded by DMA_TO) for one free ring slot */
+ ts = jiffies;
+ while (!md_avail_desc_ring_space(&chan->dma_ctx->dma_dev,
+ is_astep, chan->chan, (uint32_t) chan->next_write_index, 1)) {
+ cpu_relax();
+ if (time_after(jiffies,ts + DMA_TO)) {
+ printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num);
+ return -EBUSY;
+ }
+ }
+
+ /* Non-interrupting status descriptor: h/w writes 'value' to 'phys' */
+ md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index],
+ value,
+ phys,
+ false);
+
+ chan->next_write_index = incr_rb_index((int)chan->next_write_index,
+ chan->chan->num_desc_in_ring);
+
+ md_mic_dma_chan_write_head(&chan->dma_ctx->dma_dev,
+ chan->chan, (uint32_t)chan->next_write_index);
+ return 0;
+}
+EXPORT_SYMBOL(do_status_update);
+
+/*
+ * get_dma_mark: Obtain the current value of the DMA mark (the head of the
+ * interrupt completion ring) for a channel.
+ * @chan - DMA channel obtained via allocate_dma_channel or
+ * reserve_dma_channel; any other channel yields undefined behavior.
+ *
+ * Returns the mark, or -1 for a NULL channel.
+ */
+int get_dma_mark(struct dma_channel *chan)
+{
+ if (!chan)
+ return -1;
+
+ return chan->intr_ring.ring.head;
+}
+EXPORT_SYMBOL(get_dma_mark);
+
+/*
+ * program_dma_mark: Increment the current value of the DMA mark for a DMA channel
+ * and program an interrupt status update descriptor which ensures that all DMA
+ * descriptors programmed uptil this point in time are completed.
+ * @chan - DMA channel to use for the transfer. The channel can be allocated
+ * dynamically by calling allocate_dma_channel, or statically by
+ * reserve_dma_channel. Using a channel not allocated in this way will
+ * result in undefined behavior.
+ *
+ * Returns the new mark (>= 0) on success, -ENODEV/-EBUSY/-ERESTARTSYS on error.
+ */
+int program_dma_mark(struct dma_channel *chan)
+{
+ /*
+ * TODO:
+ * Do we need to assert the ownership of channel??
+ */
+ int intr_ring_index;
+ int err;
+ unsigned long ts = jiffies;
+ uint32_t num_status_desc = 1;
+ bool is_astep = false;
+
+ if (!verify_next_write_index(chan))
+ return -ENODEV;
+
+ /* KNC A-step and all non-KNC parts take the A-step code path in MD */
+ if (mic_hw_family(chan->dma_ctx->device_num) == FAMILY_KNC) {
+ if (mic_hw_stepping(chan->dma_ctx->device_num) == KNC_A_STEP)
+ is_astep = true;
+ } else {
+ is_astep = true;
+ }
+ might_sleep();
+ /* Reserve a slot in the interrupt completion ring (no callback used) */
+ err = wait_event_interruptible_timeout(chan->intr_wq,
+ (-1 != (intr_ring_index = allocate_buffer(&chan->intr_ring.ring))),
+ DMA_TO);
+ if (!err)
+ err = -EBUSY;
+ if (err > 0)
+ err = 0;
+ if (err)
+ return err;
+
+#ifdef CONFIG_MK1OM
+ /* K1OM programs an extra non-interrupt status descriptor */
+ num_status_desc++;
+#endif
+ /* Busy-wait (bounded by DMA_TO) for space for the status descriptor(s) */
+ ts = jiffies;
+ while (num_status_desc > md_avail_desc_ring_space(&chan->dma_ctx->dma_dev,
+ is_astep, chan->chan, (uint32_t)chan->next_write_index, num_status_desc)) {
+ cpu_relax();
+ if (time_after(jiffies,ts + DMA_TO)) {
+ printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num);
+ return -EBUSY;
+ }
+ }
+
+ /* No completion callback: the mark is observed via is_dma_mark_processed */
+ chan->intr_ring.comp_cb_array[intr_ring_index] = NULL;
+
+ incr_head(&chan->intr_ring.ring);
+#ifdef CONFIG_MK1OM
+ md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index],
+ intr_ring_index,
+ chan->intr_ring.ring.tail_phys,
+ false);
+ chan->next_write_index = incr_rb_index((int)chan->next_write_index,
+ chan->chan->num_desc_in_ring);
+#endif
+ /* Interrupting status descriptor: fires once all prior work is done */
+ md_mic_dma_prep_status_desc(&chan->desc_ring[chan->next_write_index],
+ intr_ring_index,
+ chan->intr_ring.ring.tail_phys,
+ true);
+ chan->next_write_index = incr_rb_index((int)chan->next_write_index,
+ chan->chan->num_desc_in_ring);
+
+ md_mic_dma_chan_write_head(&chan->dma_ctx->dma_dev, chan->chan, (uint32_t)chan->next_write_index);
+ return intr_ring_index;
+}
+EXPORT_SYMBOL(program_dma_mark);
+
+/*
+ * is_current_dma_mark: Test whether 'mark' is the channel's current DMA mark.
+ * @chan - DMA channel
+ * @mark - DMA mark
+ *
+ * Return true when they match, false otherwise.
+ */
+bool is_current_dma_mark(struct dma_channel *chan, int mark)
+{
+ return (mark == get_dma_mark(chan));
+}
+EXPORT_SYMBOL(is_current_dma_mark);
+
+/*
+ * is_dma_mark_processed: Test whether the given DMA mark has been retired,
+ * i.e. whether the interrupt completion ring has advanced past it.
+ * @chan - DMA channel
+ * @mark - DMA mark
+ *
+ * Return true when processed, false otherwise.
+ */
+bool is_dma_mark_processed(struct dma_channel *chan, int mark)
+{
+ struct compl_buf_ring *ring = &chan->intr_ring.ring;
+
+ return is_entry_processed(ring, mark);
+}
+EXPORT_SYMBOL(is_dma_mark_processed);
+
+/*
+ * dma_mark_wait:
+ * @chan - DMA channel
+ * @mark - DMA mark
+ * @is_interruptible - Use wait_event_interruptible() or not.
+ *
+ * Wait for the dma mark to complete. As long as the h/w tail keeps
+ * moving, the wait is retried up to DMA_FENCE_TIMEOUT_CNT times before
+ * declaring a timeout.
+ * Return 0 on success, -EBUSY on timeout, -ERESTARTSYS if interrupted.
+ */
+int dma_mark_wait(struct dma_channel *chan, int mark, bool is_interruptible)
+{
+ int err = 0;
+ uint32_t prev_tail = 0, new_tail;
+ uint32_t count = 0;
+
+ if (chan) {
+ might_sleep();
+__retry:
+ if (is_interruptible)
+ err = wait_event_interruptible_timeout(
+ chan->intr_wq,
+ is_dma_mark_processed(chan, mark),
+ DMA_TO);
+ else
+ err = wait_event_timeout(chan->intr_wq,
+ is_dma_mark_processed(chan, mark), DMA_TO);
+
+ if (!err) { // 0 is timeout
+ new_tail = get_dma_tail_pointer(chan);
+ if ((count <= DMA_FENCE_TIMEOUT_CNT) &&
+ (!count || new_tail != prev_tail)) { // For performance, prev_tail is not read at the beginning
+ prev_tail = new_tail;
+ count++;
+ pr_debug("DMA fence waiting is still ongoing, waiting for %d seconds\n", DMA_TO/HZ *count);
+ goto __retry;
+ } else {
+ /* Tail stopped moving (or retries exhausted): give up */
+ printk(KERN_ERR "%s %d TO chan 0x%x\n", __func__, __LINE__, chan->ch_num);
+ err = -EBUSY;
+ }
+ }
+ if (err > 0)
+ err = 0;
+ }
+ return err;
+}
+EXPORT_SYMBOL(dma_mark_wait);
+
+/*
+ * drain_dma_poll - Drain all outstanding DMA operations for a particular
+ * DMA channel via polling.
+ * @chan - DMA channel
+ *
+ * Queues a zero-length polling DMA and spins until it retires. As long as
+ * the h/w tail keeps moving the wait is retried up to DMA_FENCE_TIMEOUT_CNT
+ * times before declaring -EBUSY.
+ * Return 0 on success and -errno on error.
+ */
+int drain_dma_poll(struct dma_channel *chan)
+{
+ int cookie, err;
+ unsigned long ts;
+ uint32_t prev_tail = 0, new_tail, count = 0;
+ if (chan) {
+ if ((err = request_dma_channel(chan)))
+ goto error;
+ /* Zero-length transfer: only the polling status descriptor is queued */
+ if ((cookie = do_dma(chan,
+ DO_DMA_POLLING, 0, 0, 0, NULL)) < 0) {
+ err = cookie;
+ free_dma_channel(chan);
+ goto error;
+ }
+ free_dma_channel(chan);
+ ts = jiffies;
+ while (1 != poll_dma_completion(cookie, chan)) {
+ cpu_relax();
+ if (time_after(jiffies,ts + DMA_TO)) {
+ /* Retry while the h/w tail is still advancing */
+ new_tail = get_dma_tail_pointer(chan);
+ if ((!count || new_tail != prev_tail) && (count <= DMA_FENCE_TIMEOUT_CNT)) {
+ prev_tail = new_tail;
+ ts = jiffies;
+ count++;
+ pr_debug("polling DMA is still ongoing, waiting for %d seconds\n", DMA_TO/HZ * count);
+ } else {
+ err = -EBUSY;
+ break;
+ }
+ }
+ }
+error:
+ if (err)
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ } else {
+ err = -EINVAL;
+ }
+ return err;
+}
+EXPORT_SYMBOL(drain_dma_poll);
+
+/*
+ * drain_dma_intr - Drain all outstanding DMA operations for a particular
+ * DMA channel via interrupt based blocking wait.
+ * @chan - DMA channel
+ * Return 0 on success and -errno on error.
+ */
+int drain_dma_intr(struct dma_channel *chan)
+{
+ int cookie, err;
+
+ if (chan) {
+ if ((err = request_dma_channel(chan)))
+ goto error;
+ /* Queue an interrupting mark; its completion implies all prior work */
+ if ((cookie = program_dma_mark(chan)) < 0) {
+ err = cookie;
+ free_dma_channel(chan);
+ goto error;
+ }
+ /* The mark stays valid after releasing the channel */
+ free_dma_channel(chan);
+ err = dma_mark_wait(chan, cookie, false);
+error:
+ if (err)
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ } else {
+ err = -EINVAL;
+ }
+ return err;
+}
+EXPORT_SYMBOL(drain_dma_intr);
+
+/*
+ * drain_dma_global - Drain all outstanding DMA operations on every
+ * locally-owned (online) DMA channel of a board.
+ * @dma_handle - handle returned by open_dma_device
+ *
+ * Returns 0 when all channels drained, or the first drain error;
+ * -EINVAL for a NULL handle or when no channel was drained.
+ */
+int drain_dma_global(mic_dma_handle_t dma_handle)
+{
+ struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle;
+ struct dma_channel *ch;
+ int idx;
+ int err = -EINVAL;
+
+ if (!dma_ctx)
+ return err;
+
+ might_sleep();
+ for (idx = 0 ; idx < MAX_NUM_DMA_CHAN; idx++) {
+ ch = &dma_ctx->dma_channels[idx];
+ /* Channels without a desc ring are owned by the other side */
+ if (!ch->desc_ring)
+ continue;
+ err = drain_dma_intr(ch);
+ if (err)
+ break;
+ }
+ return err;
+}
+EXPORT_SYMBOL(drain_dma_global);
+
+#ifdef _MIC_SCIF_
+/*
+ * dma_suspend: DMA tasks before transition to low power state.
+ * @dma_handle: Handle for a DMA driver context.
+ *
+ * Saves, for every DMA channel (host and uOS owned alike), the descriptor
+ * ring physical base and the DSTAT writeback physical address, since both
+ * are needed to reprogram the channels when returning to the active state.
+ *
+ * Return: none
+ * Notes: Invoked only on MIC.
+ */
+void dma_suspend(mic_dma_handle_t dma_handle)
+{
+ struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle;
+ struct mic_dma_device *dma_dev = &dma_ctx->dma_dev;
+ struct dma_channel *ch;
+ int idx;
+
+ for (idx = 0; idx < MAX_NUM_DMA_CHAN; idx++) {
+ ch = &dma_ctx->dma_channels[idx];
+ ch->desc_ring_phys = md_mic_dma_chan_get_desc_ring_phys(dma_dev, ch->chan);
+ ch->chan->dstat_wb_phys = md_mic_dma_chan_get_dstatwb_phys(dma_dev, ch->chan);
+ }
+}
+EXPORT_SYMBOL(dma_suspend);
+
+/*
+ * dma_resume: DMA tasks after wake up from low power state.
+ * @dma_handle: Handle for a DMA driver context.
+ *
+ * Performs the following tasks before the device transitions
+ * from a low power state to active state:
+ * 1) As a test, reset the value in DMA configuration register.
+ * 2) Reset the next_write_index for the DMA descriptor ring to 0
+ * since the DMA channel will be reset shortly.
+ * 3) Reinitialize the DMA MD layer for the channel.
+ *
+ * Return: none
+ * Notes:
+ * Notes: Invoked only on MIC.
+ */
+void dma_resume(mic_dma_handle_t dma_handle)
+{
+ int i;
+ struct dma_channel *ch;
+ struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle;
+ struct mic_dma_device *dma_dev = &dma_ctx->dma_dev;
+
+ /* TODO: Remove test write to SBOX_DCR */
+ mic_sbox_write_mmio(dma_dev->mm_sbox, SBOX_DCR, 0);
+ for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
+ ch = &dma_ctx->dma_channels[i];
+ ch->next_write_index = 0;
+ /* Restore channel attributes, then reprogram ring base/size */
+ md_mic_dma_chan_init_attr(dma_dev, ch->chan);
+ md_mic_dma_chan_setup(dma_ctx, ch);
+ }
+}
+EXPORT_SYMBOL(dma_resume);
+#else
+
+/*
+ * dma_prep_suspend: DMA tasks required on host before a device can transition
+ * to a low power state.
+ * @dma_handle: Handle for a DMA driver context.
+ *
+ * Performs the following tasks on the host before the device can be allowed
+ * to transiti to a low power state.
+ * 1) Reset the next_Write_index for the DMA descriptor ring to 0
+ * since the DMA channel will be reset shortly. This is required primarily
+ * for Host owned DMA channels since MIC does not have access to this
+ * information.
+ * Return: none
+ * Invoked only on Host.
+ */
+void dma_prep_suspend(mic_dma_handle_t dma_handle)
+{
+ int i;
+ struct dma_channel *ch;
+ struct mic_dma_ctx_t *dma_ctx = (struct mic_dma_ctx_t *)dma_handle;
+
+ for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
+ ch = &dma_ctx->dma_channels[i];
+ ch->next_write_index = 0;
+ }
+}
+EXPORT_SYMBOL(dma_prep_suspend);
+#endif
+
#ifdef CONFIG_PAGE_CACHE_DMA
#ifdef _MIC_SCIF_
/* DMA callback table exported to the page-cache fast-copy path. */
static const struct dma_operations dma_operations_fast_copy = {
	.do_dma = do_dma,
	.poll_dma_completion = poll_dma_completion,
	.free_dma_channel = free_dma_channel,
	.open_dma_device = open_dma_device,
	.close_dma_device = close_dma_device,
	.allocate_dma_channel = allocate_dma_channel,
	.program_descriptors = program_memcpy_descriptors,
	.do_dma_polling = DO_DMA_POLLING,
};

/* Wrapper handed to register_dma_for_fast_copy() at module init. */
static const struct file_dma fdma_callback = {
	.dmaops = &dma_operations_fast_copy,
};
#endif
#endif
+
+#ifdef _MIC_SCIF_
+static int
+#else
+int
+#endif
+mic_dma_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_BOARD_SUPPORTED; i++)
+ mutex_init (&lock_dma_dev_init[i]);
+#ifdef CONFIG_PAGE_CACHE_DMA
+#ifdef _MIC_SCIF_
+ register_dma_for_fast_copy(&fdma_callback);
+#endif
+#endif
+ return 0;
+}
+
#ifdef _MIC_SCIF_
/* Module unload: detach the DMA engine from the page-cache fast-copy path. */
static void mic_dma_uninit(void)
{
#ifdef CONFIG_PAGE_CACHE_DMA
	unregister_dma_for_fast_copy();
#endif
}

module_init(mic_dma_init);
module_exit(mic_dma_uninit);
#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
/*
 * mic_dma_proc_ring_show - seq_file show handler that dumps the interrupt
 * rings, poll rings and next_write_index of every DMA channel.
 *
 * Return: 0 on success, or the negative error from micpm_get_reference()
 * if the device could not be kept out of low power states.
 */
static int
mic_dma_proc_ring_show(struct seq_file *m, void *data)
{
	struct mic_dma_ctx_t *dma_ctx = m->private;
	mic_ctx_t *mic_ctx = get_per_dev_ctx(dma_ctx->device_num - 1);
	int i, err;
	struct compl_buf_ring *ring;

	/* Keep the card awake while we read its state. */
	if ((err = micpm_get_reference(mic_ctx, true))) {
		printk(KERN_ERR "%s %d: unable to get micpm reference: %d\n",
			__func__, __LINE__, err);
		return err;
	}

	seq_printf(m, "Intr rings\n");
	seq_printf(m, "%-10s%-12s%-12s%-12s%-25s%-18s%-25s\n",
		"Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail", "In Use");
	for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
		ring = &dma_ctx->dma_channels[i].intr_ring.ring;
		seq_printf(m, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x%-#18x\n",
			i, ring->head, ring->tail, ring->size,
			ring->tail_location, *(int*)ring->tail_location,
			atomic_read(&dma_ctx->dma_channels[i].flags));
	}
	seq_printf(m, "Poll rings\n");
	seq_printf(m, "%-10s%-12s%-12s%-12s%-25s%-18s\n",
		"Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail");
	for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
		ring = &dma_ctx->dma_channels[i].poll_ring;
		seq_printf(m, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x\n",
			i, ring->head, ring->tail, ring->size,
			ring->tail_location, *(int*)ring->tail_location);
	}
	seq_printf(m, "Next_Write_Index\n");
	seq_printf(m, "%-10s%-12s\n", "Chan", "Next_Write_Index");
	for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
		seq_printf(m, "%-#10x%-#12llx\n",
			i, dma_ctx->dma_channels[i].next_write_index);
	}
	micpm_put_reference(mic_ctx);
	return 0;
}
+
/* /proc open handler: bind mic_dma_proc_ring_show to this entry's data. */
static int
mic_dma_proc_ring_open(struct inode *inode, struct file *file)
{
	return single_open(file, mic_dma_proc_ring_show, PDE_DATA(inode));
}
+
+static int
+mic_dma_proc_reg_show(struct seq_file *m, void *data)
+{
+ int i, j, chan_num, size, dtpr, err;
+ struct mic_dma_ctx_t *dma_ctx = m->private;
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(dma_ctx->device_num - 1);
+ struct mic_dma_device *dma_dev = &dma_ctx->dma_dev;
+ struct dma_channel *curr_chan;
+ union md_mic_dma_desc desc;
+
+ if ((err = micpm_get_reference(mic_ctx, true))) {
+ printk(KERN_ERR "%s %d: unable to get micpm reference: %d\n",
+ __func__, __LINE__, err);
+ return err;
+ }
+
+ seq_printf(m, "========================================"
+ "=======================================\n");
+ seq_printf(m, "SBOX_DCR: %#x\n",
+ mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR));
+ seq_printf(m, "DMA Channel Registers\n");
+ seq_printf(m, "========================================"
+ "=======================================\n");
+ seq_printf(m, "%-10s| %-10s %-10s %-10s %-10s %-10s %-10s"
+#ifdef CONFIG_MK1OM
+ " %-10s %-11s %-14s %-10s"
+#endif
+ "\n", "Channel", "DCAR", "DTPR", "DHPR",
+ "DRAR_HI", "DRAR_LO",
+#ifdef CONFIG_MK1OM
+ "DSTATWB_LO", "DSTATWB_HI", "DSTAT_CHERR", "DSTAT_CHERRMSK",
+#endif
+ "DSTAT");
+ seq_printf(m, "========================================"
+ "=======================================\n");
+
+#ifdef _MIC_SCIF_
+ for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
+#else
+ for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
+#endif
+ curr_chan = &dma_ctx->dma_channels[i];
+ chan_num = curr_chan->ch_num;
+ seq_printf(m, "%-10i| %-#10x %-#10x %-#10x %-#10x"
+ " %-#10x"
+#ifdef CONFIG_MK1OM
+ " %-#10x %-#11x %-#10x %-#14x"
+#endif
+ " %-#10x\n", chan_num,
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO),
+#ifdef CONFIG_MK1OM
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_LO),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_HI),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERRMSK),
+#endif
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTAT));
+ }
+
+ seq_printf(m, "\nDMA Channel Descriptor Rings\n");
+ seq_printf(m, "========================================"
+ "=======================================\n");
+
+ for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
+ curr_chan = &dma_ctx->dma_channels[i];
+ chan_num = curr_chan->ch_num;
+ dtpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR);
+ seq_printf(m, "Channel %i: [", chan_num);
+ size = ((int) md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR)
+ - dtpr) % curr_chan->chan->num_desc_in_ring;
+ /*
+ * In KNC B0, empty condition is tail = head -1
+ */
+ if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC &&
+ mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP)
+ size -= 1;
+
+ for (j = 0; j < size; j++) {
+ desc = curr_chan->desc_ring[(j+dtpr) %
+ curr_chan->chan->num_desc_in_ring];
+
+ switch (desc.desc.nop.type){
+ case NOP:
+ seq_printf(m," {Type: NOP, 0x%#llx"
+ " %#llx} ", desc.qwords.qw0,
+ desc.qwords.qw1);
+ case MEMCOPY:
+ seq_printf(m," {Type: MEMCOPY, SAP:"
+ " 0x%#llx, DAP: %#llx, length: %#llx} ",
+ (uint64_t) desc.desc.memcopy.sap,
+ (uint64_t) desc.desc.memcopy.dap,
+ (uint64_t) desc.desc.memcopy.length);
+ break;
+ case STATUS:
+ seq_printf(m," {Type: STATUS, data:"
+ " 0x%#llx, DAP: %#llx, intr: %lli} ",
+ (uint64_t) desc.desc.status.data,
+ (uint64_t) desc.desc.status.dap,
+ (uint64_t) desc.desc.status.intr);
+ break;
+ case GENERAL:
+ seq_printf(m," {Type: GENERAL, "
+ "DAP: %#llx, dword: %#llx} ",
+ (uint64_t) desc.desc.general.dap,
+ (uint64_t) desc.desc.general.data);
+ break;
+ case KEYNONCECNT:
+ seq_printf(m," {Type: KEYNONCECNT, sel: "
+ "%lli, h: %lli, index: %lli, cs: %lli,"
+ " value: %#llx} ",
+ (uint64_t) desc.desc.keynoncecnt.sel,
+ (uint64_t) desc.desc.keynoncecnt.h,
+ (uint64_t) desc.desc.keynoncecnt.index,
+ (uint64_t) desc.desc.keynoncecnt.cs,
+ (uint64_t) desc.desc.keynoncecnt.data);
+ break;
+ case KEY:
+ seq_printf(m," {Type: KEY, dest_ind"
+ "ex: %lli, ski: %lli, skap: %#llx ",
+ (uint64_t) desc.desc.key.di,
+ (uint64_t) desc.desc.key.ski,
+ (uint64_t) desc.desc.key.skap);
+ break;
+ default:
+ seq_printf(m," {Uknown Type=%lli ,"
+ "%#llx %#llx} ",(uint64_t) desc.desc.nop.type,
+ (uint64_t) desc.qwords.qw0,
+ (uint64_t) desc.qwords.qw1);
+ }
+ }
+ seq_printf(m, "]\n");
+ if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC &&
+ mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP &&
+ curr_chan->chan->dstat_wb_loc)
+ seq_printf(m, "DSTAT_WB = 0x%x\n",
+ *((uint32_t*)curr_chan->chan->dstat_wb_loc));
+ }
+ micpm_put_reference(mic_ctx);
+
+ return 0;
+}
+
/* /proc open handler: bind mic_dma_proc_reg_show to this entry's data. */
static int
mic_dma_proc_reg_open(struct inode *inode, struct file *file)
{
	return single_open(file, mic_dma_proc_reg_show, PDE_DATA(inode));
}
+
/* File operations for the /proc DMA ring-state dump entry. */
struct file_operations micdma_ring_fops = {
	.open = mic_dma_proc_ring_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* File operations for the /proc DMA register dump entry. */
struct file_operations micdma_reg_fops = {
	.open = mic_dma_proc_reg_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
+
+static void
+mic_dma_proc_init(struct mic_dma_ctx_t *dma_ctx)
+{
+ char name[64];
+
+ snprintf(name, 63, "%s%d", proc_dma_ring, dma_ctx->device_num);
+ if (!proc_create_data(name, S_IFREG | S_IRUGO, NULL, &micdma_ring_fops, dma_ctx))
+ printk("micdma: unable to register /proc/%s\n", name);
+
+ snprintf(name, 63, "%s%d", proc_dma_reg, dma_ctx->device_num);
+ if (!proc_create_data(name, S_IFREG | S_IRUGO, NULL, &micdma_reg_fops, dma_ctx))
+ printk("micdma: unable to register /proc/%s\n", name);
+
+}
+#else // LINUX VERSION
/*
 * mic_dma_proc_read_fn - Legacy (pre-3.10) /proc read handler dumping the
 * interrupt rings, poll rings and next_write_index of every DMA channel.
 *
 * NOTE(review): offset, count and eof are ignored, so this relies on the
 * whole dump fitting into the proc core's single-page buffer — confirm
 * the output stays below PAGE_SIZE for MAX_NUM_DMA_CHAN channels.
 */
static int
mic_dma_proc_read_fn(char *buf, char **start, off_t offset, int count, int *eof, void *data)
{
	struct mic_dma_ctx_t *dma_ctx = data;
	int i, len = 0;
	struct compl_buf_ring *ring;

	len += sprintf(buf + len, "Intr rings\n");
	len += sprintf(buf + len, "%-10s%-12s%-12s%-12s%-25s%-18s%-25s\n",
		"Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail", "In Use");
	for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
		ring = &dma_ctx->dma_channels[i].intr_ring.ring;
		len += sprintf(buf + len, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x%-#18x\n",
			i, ring->head, ring->tail, ring->size,
			ring->tail_location, *(int*)ring->tail_location,
			atomic_read(&dma_ctx->dma_channels[i].flags));
	}
	len += sprintf(buf + len, "Poll rings\n");
	len += sprintf(buf + len, "%-10s%-12s%-12s%-12s%-25s%-18s\n",
		"Chan", "Head", "Tail", "Size", "Tail loc", "Actual tail");
	for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
		ring = &dma_ctx->dma_channels[i].poll_ring;
		len += sprintf(buf + len, "%-#10x%-#12x%-#12x%-#12x%-#25llx%-#18x\n",
			i, ring->head, ring->tail, ring->size,
			ring->tail_location, *(int*)ring->tail_location);
	}
	len += sprintf(buf + len, "Next_Write_Index\n");
	len += sprintf(buf + len, "%-10s%-12s\n", "Chan", "Next_Write_Index");
	for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
		len += sprintf(buf + len, "%-#10x%-#12llx\n",
			i, dma_ctx->dma_channels[i].next_write_index);
	}
	return len;
}
+
+static int
+mic_dma_proc_read_registers_fn(char *buf, char **start, off_t offset, int count,
+ int *eof, void *data)
+{
+ int i, j, chan_num, size, dtpr, len = 0;
+ struct mic_dma_ctx_t *dma_ctx = data;
+ struct mic_dma_device *dma_dev = &dma_ctx->dma_dev;
+ struct dma_channel *curr_chan;
+ union md_mic_dma_desc desc;
+
+ len += sprintf(buf + len, "========================================"
+ "=======================================\n");
+ len += sprintf(buf + len, "SBOX_DCR: %#x\n",
+ mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR));
+ len += sprintf(buf + len, "DMA Channel Registers\n");
+ len += sprintf(buf + len, "========================================"
+ "=======================================\n");
+ len += sprintf(buf + len, "%-10s| %-10s %-10s %-10s %-10s %-10s %-10s"
+#ifdef CONFIG_MK1OM
+ " %-10s %-11s %-14s %-10s"
+#endif
+ "\n", "Channel", "DCAR", "DTPR", "DHPR",
+ "DRAR_HI", "DRAR_LO",
+#ifdef CONFIG_MK1OM
+ "DSTATWB_LO", "DSTATWB_HI", "DSTAT_CHERR", "DSTAT_CHERRMSK",
+#endif
+ "DSTAT");
+ len += sprintf(buf + len, "========================================"
+ "=======================================\n");
+
+#ifdef _MIC_SCIF_
+ for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
+#else
+ for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
+#endif
+ curr_chan = &dma_ctx->dma_channels[i];
+ chan_num = curr_chan->ch_num;
+ len += sprintf(buf + len, "%-10i| %-#10x %-#10x %-#10x %-#10x"
+ " %-#10x"
+#ifdef CONFIG_MK1OM
+ " %-#10x %-#11x %-#10x %-#14x"
+#endif
+ " %-#10x\n", chan_num,
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO),
+#ifdef CONFIG_MK1OM
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_LO),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_HI),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERR),
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCHERRMSK),
+#endif
+ md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTAT));
+ }
+
+ len += sprintf(buf + len, "\nDMA Channel Descriptor Rings\n");
+ len += sprintf(buf + len, "========================================"
+ "=======================================\n");
+
+ for (i = first_dma_chan(); i <= last_dma_chan(); i++) {
+ curr_chan = &dma_ctx->dma_channels[i];
+ chan_num = curr_chan->ch_num;
+ dtpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR);
+ len += sprintf(buf + len, "Channel %i: [", chan_num);
+ size = ((int) md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR)
+ - dtpr) % curr_chan->chan->num_desc_in_ring;
+ /*
+ * In KNC B0, empty condition is tail = head -1
+ */
+ if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC &&
+ mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP)
+ size -= 1;
+
+ for (j = 0; j < size; j++) {
+ desc = curr_chan->desc_ring[(j+dtpr) %
+ curr_chan->chan->num_desc_in_ring];
+
+ switch (desc.desc.nop.type){
+ case NOP:
+ len += sprintf(buf + len," {Type: NOP, 0x%#llx"
+ " %#llx} ", desc.qwords.qw0,
+ desc.qwords.qw1);
+ case MEMCOPY:
+ len += sprintf(buf + len," {Type: MEMCOPY, SAP:"
+ " 0x%#llx, DAP: %#llx, length: %#llx} ",
+ (uint64_t) desc.desc.memcopy.sap,
+ (uint64_t) desc.desc.memcopy.dap,
+ (uint64_t) desc.desc.memcopy.length);
+ break;
+ case STATUS:
+ len += sprintf(buf + len," {Type: STATUS, data:"
+ " 0x%#llx, DAP: %#llx, intr: %lli} ",
+ (uint64_t) desc.desc.status.data,
+ (uint64_t) desc.desc.status.dap,
+ (uint64_t) desc.desc.status.intr);
+ break;
+ case GENERAL:
+ len += sprintf(buf + len," {Type: GENERAL, "
+ "DAP: %#llx, dword: %#llx} ",
+ (uint64_t) desc.desc.general.dap,
+ (uint64_t) desc.desc.general.data);
+ break;
+ case KEYNONCECNT:
+ len += sprintf(buf + len," {Type: KEYNONCECNT, sel: "
+ "%lli, h: %lli, index: %lli, cs: %lli,"
+ " value: %#llx} ",
+ (uint64_t) desc.desc.keynoncecnt.sel,
+ (uint64_t) desc.desc.keynoncecnt.h,
+ (uint64_t) desc.desc.keynoncecnt.index,
+ (uint64_t) desc.desc.keynoncecnt.cs,
+ (uint64_t) desc.desc.keynoncecnt.data);
+ break;
+ case KEY:
+ len += sprintf(buf + len," {Type: KEY, dest_ind"
+ "ex: %lli, ski: %lli, skap: %#llx ",
+ (uint64_t) desc.desc.key.di,
+ (uint64_t) desc.desc.key.ski,
+ (uint64_t) desc.desc.key.skap);
+ break;
+ default:
+ len += sprintf(buf + len," {Uknown Type=%lli ,"
+ "%#llx %#llx} ",(uint64_t) desc.desc.nop.type,
+ (uint64_t) desc.qwords.qw0,
+ (uint64_t) desc.qwords.qw1);
+ }
+ }
+ len += sprintf(buf + len, "]\n");
+ if (mic_hw_family(dma_ctx->device_num) == FAMILY_KNC &&
+ mic_hw_stepping(dma_ctx->device_num) >= KNC_B0_STEP &&
+ curr_chan->chan->dstat_wb_loc)
+ len += sprintf(buf + len, "DSTAT_WB = 0x%x\n",
+ *((uint32_t*)curr_chan->chan->dstat_wb_loc));
+ }
+ return len;
+}
+
+static void
+mic_dma_proc_init(struct mic_dma_ctx_t *dma_ctx)
+{
+ struct proc_dir_entry *dma_proc;
+ char name[64];
+
+ snprintf(name, 63, "%s%d", proc_dma_ring, dma_ctx->device_num);
+ if ((dma_proc = create_proc_entry(name, S_IFREG | S_IRUGO, NULL)) != NULL) {
+ dma_proc->read_proc = mic_dma_proc_read_fn;
+ dma_proc->data = dma_ctx;
+ }
+ snprintf(name, 63, "%s%d", proc_dma_reg, dma_ctx->device_num);
+ if ((dma_proc = create_proc_entry(name, S_IFREG | S_IRUGO, NULL)) != NULL) {
+ dma_proc->read_proc = mic_dma_proc_read_registers_fn;
+ dma_proc->data = dma_ctx;
+ }
+
+}
+#endif // LINUX VERSION
+
+static void
+mic_dma_proc_uninit(struct mic_dma_ctx_t *dma_ctx)
+{
+ char name[64];
+
+ snprintf(name, 63, "%s%d", proc_dma_reg, dma_ctx->device_num);
+ remove_proc_entry(name, NULL);
+ snprintf(name, 63, "%s%d", proc_dma_ring, dma_ctx->device_num);
+ remove_proc_entry(name, NULL);
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include<linux/module.h>
+#include<linux/slab.h>
+#include<asm/io.h>
+#include<linux/kernel.h>
+
+#include <mic/micscif_smpt.h>
+#include <mic/mic_dma_md.h>
+#include <mic/mic_dma_api.h>
+
+#define PR_PREFIX "DMA_LIB_MD:"
+
+#ifdef CONFIG_ML1OM
+#define MIC_DMA_AES_CHAN_NUM 7
+#define is_AES_channel(n) ((n) == MIC_DMA_AES_CHAN_NUM)
+#else
+#define is_AES_channel(n) ((void)(n), 0)
+#endif
+
+#define DMA_CHAN_COOKIE 0xdeadc0d
+
+#define SBOX_DCAR_IM0 (0x1 << 24) // APIC Interrupt mask bit
+#define SBOX_DCAR_IM1 (0x1 << 25) // MSI-X Interrupt mask bit
+#define SBOX_DCAR_IS0 (0x1 << 26) // Interrupt status
+
+#define SBOX_DRARHI_SYS_MASK (0x1 << 26)
+
#ifdef _MIC_SCIF_
/*
 * chan_to_dcr_mask - Merge a channel's ownership bit into a DCR value.
 * @dcr: Current SBOX DCR register value.
 * @chan: Channel whose bit should be updated.
 * @dma_dev: DMA device (unused here).
 *
 * Each channel has a 2-bit field in DCR at position (ch_num * 2); the
 * low bit carries the owner. For the AES channel the same bit selects
 * the endianness instead.
 */
static inline uint32_t chan_to_dcr_mask(uint32_t dcr, struct md_mic_dma_chan *chan, struct mic_dma_device *dma_dev)
{
	uint32_t chan_num = chan->ch_num;
	uint32_t owner;

	if (!is_AES_channel(chan_num))
		owner = chan->owner;
	else
		owner = chan->endianness;

	return ((dcr & ~(0x1 << (chan_num * 2))) | (owner << (chan_num * 2)));
}
#endif
+
/*
 * drar_hi_to_ba_bits - Extract the base-address high bits from a DRAR_HI
 * value: bits 3:0 on MIC, bits 1:0 on the host.
 */
static inline uint32_t drar_hi_to_ba_bits(uint32_t drar_hi)
{
	/*
	 * Setting bits 3:2 should generate a DESC_ADDR_ERR but the hardware ignores
	 * these bits currently and doesn't generate the error.
	 */
#ifdef _MIC_SCIF_
	return drar_hi & 0xf;
#else
	return drar_hi & 0x3;
#endif
}
+
/* Encode the high (>32) bits of a ring base physical address into the
 * DRAR_HI base-address field. */
static inline uint32_t physaddr_to_drarhi_ba(phys_addr_t phys_addr)
{
	return drar_hi_to_ba_bits((uint32_t)(phys_addr >> 32));
}
+
/*
 * size_to_drar_hi_size - Encode a descriptor ring size into the DRAR_HI
 * size field (17 bits wide, starting at bit 4).
 */
static inline uint32_t size_to_drar_hi_size(uint32_t size)
{
	uint32_t field = size & 0x1ffff;

	return field << 4;
}
+
/* Encode the 5-bit SMPT page number of a physical address into DRAR_HI
 * bits 25:21 (used for host-owned descriptor rings). */
static inline uint32_t addr_to_drar_hi_smpt_bits(phys_addr_t mic_phys_addr)
{
	return ((mic_phys_addr >> MIC_SYSTEM_PAGE_SHIFT) & 0x1f) << 21;
}
+
/*
 * drar_hi_to_smpt - Extract the 5-bit SMPT page number from DRAR_HI
 * bits 25:21. @chan_num is accepted but unused.
 */
static inline uint32_t drar_hi_to_smpt(uint32_t drar_hi, uint32_t chan_num)
{
	return (drar_hi & (0x1fu << 21)) >> 21;
}
+
+void md_mic_dma_enable_chan(struct mic_dma_device *dma_dev, uint32_t chan_num, bool enable);
+
+
#ifdef _MIC_SCIF_
/**
 * md_mic_dma_chan_init_attr - Set channel attributes like owner and endianness
 * @dma_dev: The DMA device.
 * @chan: The DMA channel handle
 *
 * Read-modify-writes the SBOX DCR register so the channel's per-channel
 * bit reflects the owner (or endianness for the AES channel).
 */
void md_mic_dma_chan_init_attr(struct mic_dma_device *dma_dev,
			struct md_mic_dma_chan *chan)
{
	uint32_t dcr;

	CHECK_CHAN(chan);

	dcr = mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR);
	dcr = chan_to_dcr_mask(dcr, chan, dma_dev);
	mic_sbox_write_mmio(dma_dev->mm_sbox, SBOX_DCR, dcr);
}
#endif
+
+/* One time DMA Init API */
+void md_mic_dma_init(struct mic_dma_device *dma_dev, uint8_t *mmio_va_base)
+{
+ int i;
+#ifdef _MIC_SCIF_
+ dma_dev->mm_sbox = mic_sbox_md_init();
+#else
+ dma_dev->mm_sbox = mmio_va_base;
+#endif
+ //pr_debug("sbox: va=%p\n", dma_dev.mm_sbox);
+
+ for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
+ atomic_set(&(dma_dev->chan_info[i].in_use), CHAN_AVAILABLE);
+ dma_dev->chan_info[i].cookie = DMA_CHAN_COOKIE;
+ dma_dev->chan_info[i].dstat_wb_phys = 0;
+ dma_dev->chan_info[i].dstat_wb_loc = NULL;
+ }
+ return;
+}
+
/* One time DMA Uninit API */
/* Currently a no-op; kept for API symmetry with md_mic_dma_init(). */
void md_mic_dma_uninit(struct mic_dma_device *dma_dev)
{
	return;
}
+
/**
 * md_mic_dma_request_chan
 * @dma_dev: The DMA device.
 * @owner: DMA channel owner: MIC or Host
 *
 * Scans the channel table and atomically claims the first free slot
 * (in_use: CHAN_AVAILABLE -> CHAN_INUSE).
 *
 * Return - The DMA channel handle or NULL if failed
 *
 * Note: Allocating a Host owned channel is not allowed currently
 */
struct md_mic_dma_chan *md_mic_dma_request_chan(struct mic_dma_device *dma_dev,
					enum md_mic_dma_chan_owner owner)
{
	struct md_mic_dma_chan *tmp = NULL;
	int i;

	for (i = 0; i < MAX_NUM_DMA_CHAN; i++) {
		/* cmpxchg guarantees only one caller can win each slot. */
		if (CHAN_AVAILABLE == atomic_cmpxchg(&(dma_dev->chan_info[i].in_use),
			CHAN_AVAILABLE, CHAN_INUSE)) {
			tmp = &dma_dev->chan_info[i];
			tmp->owner = owner;
			tmp->ch_num = i;
			/*
			 * Setting endianness by default to MIC_LITTLE_ENDIAN
			 * in case the AES channel is used for clear transfers
			 * This is a don't care for clear transfers.
			 */
			tmp->endianness = MIC_LITTLE_ENDIAN;
#ifdef _MIC_SCIF_
			md_mic_dma_chan_init_attr(dma_dev, tmp);
#endif
			break;
		}
	}
	return tmp;
}
+
/**
 * md_mic_dma_free_chan - Frees up a DMA channel
 * @dma_dev: The DMA device.
 * @chan: The DMA channel handle
 *
 * NOTE(review): the slot is marked available before the channel is
 * disabled, so a concurrent md_mic_dma_request_chan() could claim and
 * reconfigure it while the disable below is still pending — confirm
 * callers serialize free/request.
 */
void md_mic_dma_free_chan(struct mic_dma_device *dma_dev,
			struct md_mic_dma_chan *chan)
{
	CHECK_CHAN(chan);
	atomic_set(&(chan->in_use), CHAN_AVAILABLE);
	md_mic_dma_enable_chan(dma_dev, chan->ch_num, false);
}
+
/**
 * md_mic_dma_enable_chan - Enable/disable the DMA channel
 * @dma_dev: The DMA device.
 * @chan_num: The DMA channel
 * @enable: enable/disable
 *
 * Must set desc ring and update head pointer only
 * after disabling the channel
 */
void md_mic_dma_enable_chan(struct mic_dma_device *dma_dev,
		uint32_t chan_num, bool enable)
{
	uint32_t dcr = mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR);

	/*
	 * There is a separate bit for every channel.
	 * Look up sboxDcrReg.
	 */
	/* Each channel owns a 2-bit DCR field at (chan_num * 2); the value
	 * 2 selects the field's high (enable) bit. */
	if (enable) {
		dcr |= 2 << (chan_num << 1);
	} else {
		dcr &= ~(2 << (chan_num << 1));
	}
	mic_sbox_write_mmio(dma_dev->mm_sbox, SBOX_DCR, dcr);
}
+
+#if 0
+uint32_t md_mic_dma_chan_read_completion_count(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
+{
+ CHECK_CHAN(chan);
+
+ return (md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DSTAT) & 0xffff);
+}
+
+
+/* This function needs to be used only in error case */
+void update_compcount_and_tail(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
+{
+ chan->completion_count = md_mic_dma_chan_read_completion_count(dma_dev, chan);
+ chan->cached_tail = md_mic_dma_chan_read_tail(dma_dev, chan);
+}
+#endif
+void md_mic_dma_chan_set_dstat_wb(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan)
+{
+ uint32_t dstat_wb, dstat_wb_hi;
+ CHECK_CHAN(chan);
+
+ dstat_wb = (uint32_t)chan->dstat_wb_phys;
+ dstat_wb_hi = chan->dstat_wb_phys >> 32;
+ md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DSTATWB_LO, dstat_wb);
+ md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DSTATWB_HI, dstat_wb_hi);
+}
+
/* Program the channel's DMA error mask register (DCHERRMSK). */
void md_mic_dma_chan_set_dcherr_msk(struct mic_dma_device *dma_dev,
		struct md_mic_dma_chan *chan, uint32_t mask)
{
	CHECK_CHAN(chan);
	md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DCHERRMSK, mask);
}
+#if 0
+uint32_t md_mic_dma_chan_get_dcherr_msk(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan)
+{
+ CHECK_CHAN(chan);
+ return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCHERRMSK);
+}
+
+uint32_t md_mic_dma_chan_get_dcherr(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan)
+{
+ CHECK_CHAN(chan);
+ return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCHERR);
+}
+
+void md_mic_dma_chan_set_dcherr(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan, uint32_t value)
+{
+ CHECK_CHAN(chan);
+ md_mic_dma_write_mmio(dma_dev, chan->ch_num, REG_DCHERR, value);
+ printk("dcherr = %d\n", md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCHERR));
+}
+#endif
+
/**
 * md_mic_dma_chan_set_desc_ring - Configures the DMA channel desc ring
 * @dma_dev: The DMA device.
 * @chan: The DMA channel handle
 * @desc_ring_phys_addr: Physical address of the desc ring base. Needs to be
 * physically contiguous and wired down memory.
 * @num_desc: Number of descriptors must be a multiple of cache line size.
 * Descriptor size should be determined using sizeof(union md_mic_dma_desc).
 * The maximum number of descriptors is defined by
 * MIC_MAX_NUM_DESC_PER_RING.
 */
void md_mic_dma_chan_set_desc_ring(struct mic_dma_device *dma_dev,
				struct md_mic_dma_chan *chan,
				phys_addr_t desc_ring_phys_addr,
				uint32_t num_desc)
{
	uint32_t chan_num;
	uint32_t drar_lo = 0;
	uint32_t drar_hi = 0;

	CHECK_CHAN(chan);
	chan_num = chan->ch_num;
	/*
	 * TODO: Maybe the 2nd condition should be different considering the
	 * size of union md_mic_dma_desc?
	 */
	KASSERT((((num_desc) <= MIC_MAX_NUM_DESC_PER_RING) &&
		(ALIGN((num_desc - (L1_CACHE_BYTES - 1)), L1_CACHE_BYTES) == num_desc)),
		"num_desc > max or not multiple of cache line num 0x%x", num_desc);

	/* The ring may only be reprogrammed while the channel is disabled
	 * (see md_mic_dma_enable_chan). */
	md_mic_dma_enable_chan(dma_dev, chan_num, false);

	drar_hi = size_to_drar_hi_size(num_desc);

	/* Host-owned rings live in system memory: set the SYS bit and the
	 * SMPT page number. */
	if (MIC_DMA_CHAN_HOST_OWNED == chan->owner) {
		drar_hi |= SBOX_DRARHI_SYS_MASK;
		drar_hi |= addr_to_drar_hi_smpt_bits(desc_ring_phys_addr);
	}
	drar_lo = (uint32_t)desc_ring_phys_addr;
	drar_hi |= physaddr_to_drarhi_ba(desc_ring_phys_addr);
	md_mic_dma_write_mmio(dma_dev, chan_num, REG_DRAR_LO, drar_lo);
	md_mic_dma_write_mmio(dma_dev, chan_num, REG_DRAR_HI, drar_hi);
	chan->num_desc_in_ring = num_desc;
	pr_debug("md_mic_dma_chan_set_desc_ring addr=0x%llx num=%d drar_hi.bits.pageno 0x%x\n",
		desc_ring_phys_addr, num_desc,
		(uint32_t)(desc_ring_phys_addr >> MIC_SYSTEM_PAGE_SHIFT));
	/* Re-seed the cached tail from hardware before re-enabling. */
	chan->cached_tail = md_mic_dma_chan_read_tail(dma_dev, chan);

	md_mic_dma_enable_chan(dma_dev, chan_num, true);
}
+
/* Read the channel's descriptor head pointer register (DHPR). */
uint32_t md_mic_dma_chan_read_head(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
{
	CHECK_CHAN(chan);

	return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DHPR);
}
+
/* Read the channel's descriptor tail pointer register (DTPR). */
uint32_t md_mic_dma_chan_read_tail(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
{
	CHECK_CHAN(chan);

	return md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DTPR);
}
+
+/**
+ * md_mic_dma_chan_intr_pending - Reads interrupt status to figure out
+ * if an interrupt is pending.
+ * @chan: The DMA channel handle.
+ */
+bool md_mic_dma_chan_intr_pending(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
+{
+ uint32_t dcar;
+ CHECK_CHAN(chan);
+
+ dcar = md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DCAR);
+ return (dcar >> 26) & 0x1;
+}
+
/**
 * md_mic_dma_chan_mask_intr - Mask or disable interrupts
 * @dma_dev: The DMA device.
 * @chan: The DMA channel handle
 *
 * Masking interrupts will also acknowledge any pending
 * interrupts on the channel.
 */
void md_mic_dma_chan_mask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
{
	uint32_t dcar;
	uint32_t chan_num;
	CHECK_CHAN(chan);
	chan_num = chan->ch_num;

	dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR);

	/* IM0 masks the APIC interrupt (MIC-owned channels); IM1 masks the
	 * MSI-X interrupt (host-owned channels). */
	if (MIC_DMA_CHAN_MIC_OWNED == chan->owner)
		dcar |= SBOX_DCAR_IM0;
	else
		dcar |= SBOX_DCAR_IM1;

	md_mic_dma_write_mmio(dma_dev, chan_num, REG_DCAR, dcar);
	/*
	 * This read is completed only after previous write is completed.
	 * It guarantees that, interrupts has been acknowledged to SBOX DMA
	 * This read forces previous write to be commited in memory.
	 * This is the actual fix for HSD# 3497216 based on theoretical
	 * hypothesis that somehow previous write is not truly completed
	 * since for writes as long as transactions are accepted by SBOX
	 * ( not necessarily commited in memory) those write transactions
	 * reported as complete.
	 */
	dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR);
}
+
/**
 * md_mic_dma_chan_unmask_intr - Unmask or enable interrupts
 * @dma_dev: The DMA device.
 * @chan: The DMA channel handle
 */
void md_mic_dma_chan_unmask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
{
	uint32_t dcar;
	uint32_t chan_num;
	CHECK_CHAN(chan);
	chan_num = chan->ch_num;

	dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR);

	/* Clear IM0 (APIC mask, MIC-owned) or IM1 (MSI-X mask, host-owned). */
	if (MIC_DMA_CHAN_MIC_OWNED == chan->owner)
		dcar &= ~SBOX_DCAR_IM0;
	else
		dcar &= ~SBOX_DCAR_IM1;

	md_mic_dma_write_mmio(dma_dev, chan_num, REG_DCAR, dcar);
	/*
	 * This read is completed only after previous write is completed.
	 * It guarantees that, interrupts has been acknowledged to SBOX DMA
	 * This read forces previous write to be commited in memory.
	 * This is the actual fix for HSD# 3497216 based on theoretical
	 * hypothesis that somehow previous write is not truly completed
	 * since for writes as long as transactions are accepted by SBOX
	 * ( not necessarily commited in memory) those write transactions
	 * reported as complete.
	 */
	dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR);
}
+
/**
 * md_mic_dma_chan_get_desc_ring_phys - Compute the value of the descriptor ring
 * base physical address from the descriptor ring attributes register.
 * @dma_dev: DMA device.
 * @chan: The DMA channel handle
 */
phys_addr_t
md_mic_dma_chan_get_desc_ring_phys(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
{
	phys_addr_t phys, phys_hi;
	uint32_t phys_lo, chan_num, drar_hi;

	CHECK_CHAN(chan);
	chan_num = chan->ch_num;
	phys_lo = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO);
	drar_hi = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI);
	/* High address bits come from the DRAR_HI base-address field plus
	 * the SMPT page number shifted into place.
	 * NOTE(review): on MIC drar_hi_to_ba_bits() keeps bits 3:0, which
	 * overlap the (smpt << 2) contribution below — confirm intended. */
	phys_hi = drar_hi_to_ba_bits(drar_hi);
	phys_hi |= drar_hi_to_smpt(drar_hi, chan_num) << 2;

	phys = phys_lo | (phys_hi << 32);
	return phys;
}
+
+/**
+ * md_mic_dma_chan_get_dstatwb_phys - Compute the value of the DSTAT write back
+ * physical address.
+ * @dma_dev: DMA device.
+ * @chan: The DMA channel handle
+ */
+phys_addr_t md_mic_dma_chan_get_dstatwb_phys(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan)
+{
+ uint32_t reg, chan_num;
+ phys_addr_t phys;
+
+ CHECK_CHAN(chan);
+ chan_num = chan->ch_num;
+ reg = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_HI);
+ phys = reg;
+ reg = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTATWB_LO);
+
+ phys = phys << 32 | reg;
+ return phys;
+}
+
+/**
+ * md_mic_dma_prep_nop_desc - Prepares a NOP descriptor.
+ * @desc: Descriptor to be populated.
+ *
+ * This descriptor is used to pad a cacheline if the previous
+ * descriptor does not end on a cacheline boundary.
+ */
+void md_mic_dma_prep_nop_desc(union md_mic_dma_desc *desc)
+{
+ KASSERT((desc != 0), ("NULL desc"));
+
+ desc->qwords.qw0 = 0;
+ desc->qwords.qw1 = 0;
+ desc->desc.nop.type = 0;
+}
+
+/* Only Debug Code Below */
+
+/**
+ * md_mic_dma_print_debug - Print channel debug information
+ * @chan: The DMA channel handle
+ * @sbuf: Print to an sbuf if not NULL else prints to console
+ */
+void md_mic_dma_print_debug(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan)
+{
+ uint32_t dcr;
+ uint32_t dcar;
+ uint32_t dtpr;
+ uint32_t dhpr;
+ uint32_t drar_lo;
+ uint32_t drar_hi;
+ uint32_t dstat;
+ uint32_t chan_num = chan->ch_num;
+
+ dcr = mic_sbox_read_mmio(dma_dev->mm_sbox, SBOX_DCR);
+ dcar = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DCAR);
+ dtpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DTPR);
+ dhpr = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DHPR);
+ drar_lo = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_LO);
+ drar_hi = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DRAR_HI);
+ dstat = md_mic_dma_read_mmio(dma_dev, chan_num, REG_DSTAT);
+ pr_debug(PR_PREFIX "Chan_Num 0x%x DCR 0x%x DCAR 0x%x DTPR 0x%x"
+ "DHPR 0x%x DRAR_HI 0x%x DRAR_LO 0x%x DSTAT 0x%x\n",
+ chan_num, dcr, dcar, dtpr, dhpr, drar_hi, drar_lo, dstat);
+ pr_debug(PR_PREFIX "DCR 0x%x\n", dcr);
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include<linux/module.h>
+#include<linux/init.h>
+#include<asm/io.h>
+
+#include <mic/mic_sbox_md.h>
+#include <mic/micsboxdefine.h>
+
+#define PR_PREFIX "SBOX:"
+
+extern void *mic_sbox_mmio_va;
+
+void *mic_sbox_md_init(void)
+{
+ return mic_sbox_mmio_va;
+}
+
+void mic_sbox_md_uninit(void *mic_sbox_mmio_va)
+{
+ iounmap(mic_sbox_mmio_va);
+ pr_debug(PR_PREFIX "Uninitialized sbox md\n");
+}
+
--- /dev/null
+#
+# Manycore Throughput Linux Driver
+# Copyright (c) 2010, Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms and conditions of the GNU General Public License,
+# version 2, as published by the Free Software Foundation.
+#
+# This program is distributed in the hope it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#
+
KERNELDIR = /lib/modules/$(shell uname -r)/build
KBUILD := $(MAKE) -C $(KERNELDIR) M=$(CURDIR)
EXTRADIR = $(shell readlink -f $(KERNELDIR))

ifneq ($(DESTDIR),)
INSTALL_MOD_PATH = $(DESTDIR)
endif

# Fix: `udev` added to .PHONY — it names a rule, not a generated file,
# so a stray file called "udev" must not suppress the install step.
.PHONY: default modules install modules_install clean udev

default: modules
install: modules_install udev

modules:
	+$(KBUILD) $@

modules_install:
	+$(KBUILD) INSTALL_MOD_PATH=$(DESTDIR) modules_install
	mkdir -p $(DESTDIR)$(EXTRADIR)/include
	install -m644 include/scif.h $(DESTDIR)$(EXTRADIR)/include
	install -m644 Module.symvers $(DESTDIR)$(EXTRADIR)/Module.symvers.mic

udev: udev-scif.rules
	mkdir -p $(DESTDIR)/etc/udev/rules.d
	cp $< $(DESTDIR)/etc/udev/rules.d/50-$<

clean:
	+$(KBUILD) clean
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <micint.h>
+
+#include <scif.h>
+#include <mic_common.h>
+
+#define ACPT_BACKLOG 120
+#define ACPT_POLL_MS 2000
+
+#define ACPT_BOOTED 1
+#define ACPT_BOOT_ACK 2
+#define ACPT_NACK_VERSION 3
+#define ACPT_REQUEST_TIME 4
+#define ACPT_TIME_DATA 5
+
+#define ACPT_VERSION 1
+
+static acptboot_data_t *acptboot_data;
+
+
+void acptboot_getconn(struct work_struct *work)
+{
+ mic_ctx_t *node_ctx;
+ struct scif_portID data;
+ scif_epd_t conn_epd;
+ struct timespec tod;
+ int proto;
+ int version;
+ int err;
+
+ if ((err = scif_accept(acptboot_data->listen_epd, &data, &conn_epd,
+ SCIF_ACCEPT_SYNC))) {
+ pr_debug("ACPTBOOT: scif_accept_failed %d\n", err);
+ return;
+
+ //goto requeue_accept;
+ }
+
+ if (!data.node) {
+ printk(KERN_ERR "ACPTBOOT: connect received from invalid dev %d\n",
+ -EINVAL);
+ goto close_epd;
+ }
+
+ if ((err = scif_recv(conn_epd, &version, sizeof(version), SCIF_RECV_BLOCK)) != sizeof(version)) {
+ printk(KERN_ERR "ACPTBOOT: failed to recieve version number err %d\n", err);
+ goto close_epd;
+ }
+
+ if ((err = scif_recv(conn_epd, &proto, sizeof(proto), SCIF_RECV_BLOCK)) != sizeof(proto)) {
+ printk(KERN_ERR "ACPTBOOT: failed to recieve proto id %d\n", err);
+ goto close_epd;
+ }
+
+ switch (proto) {
+ case ACPT_BOOTED:
+ node_ctx = get_per_dev_ctx(data.node - 1);
+ mic_setstate(node_ctx, MIC_ONLINE);
+ node_ctx->boot_count++;
+
+ proto = ACPT_BOOT_ACK;
+ scif_send(conn_epd, &proto, sizeof(proto), SCIF_SEND_BLOCK);
+ break;
+
+ case ACPT_REQUEST_TIME:
+ getnstimeofday(&tod);
+ proto = ACPT_TIME_DATA;
+ scif_send(conn_epd, &proto, sizeof(proto), SCIF_SEND_BLOCK);
+ scif_send(conn_epd, &tod, sizeof(tod), SCIF_SEND_BLOCK);
+ break;
+ }
+
+close_epd:
+ if ((err = scif_close(conn_epd)))
+ printk(KERN_ERR "ACPTBOOT: scif_close failed %d\n", err);
+
+//requeue_accept:
+ queue_work(acptboot_data->acptbootwq, &acptboot_data->acptbootwork);
+}
+
+void acptboot_exit(void)
+{
+ int err = 0;
+ if (acptboot_data) {
+ if (acptboot_data->listen_epd)
+ if ((err = scif_close(acptboot_data->listen_epd)) < 0)
+ pr_debug("scif_close failed %d\n", err);
+ destroy_workqueue(acptboot_data->acptbootwq);
+
+ kfree(acptboot_data);
+ }
+}
+
+int
+acptboot_init(void)
+{
+ int err, ret;
+
+ acptboot_data = (acptboot_data_t *)kzalloc(sizeof(*acptboot_data), GFP_KERNEL);
+
+ if (!acptboot_data) {
+ printk(KERN_ERR "ACPTBOOT: memory allocation failure\n");
+ return -ENOMEM;
+ }
+
+ acptboot_data->listen_epd = scif_open();
+
+ if (!acptboot_data->listen_epd) {
+ printk(KERN_ERR "ACPTBOOT: scif_open() failed!\n");
+ err = -ENOMEM;
+ goto error;
+ }
+
+ err = scif_bind(acptboot_data->listen_epd, MIC_NOTIFY);
+ if (err < 0) {
+ pr_debug("ACPTBOOT: scif_bind() failed! %d\n", err);
+ goto error;
+ }
+
+ acptboot_data->acptboot_pn = err;
+
+ err = scif_listen(acptboot_data->listen_epd, ACPT_BACKLOG);
+ if (err < 0) {
+ pr_debug("scif_listen() failed! %d\n", err);
+ goto error;
+
+ }
+
+ pr_debug("ACPT endpoint listening port %d\n",
+ acptboot_data->acptboot_pn);
+
+ // Create workqueue
+ acptboot_data->acptbootwq = __mic_create_singlethread_workqueue(
+ "ACPTBOOT_WQ");
+
+ if (!acptboot_data->acptbootwq) {
+ printk(KERN_ERR "%s %d wq creation failed!\n", __func__, __LINE__);
+ goto error;
+ }
+
+ INIT_WORK(&acptboot_data->acptbootwork, acptboot_getconn);
+ queue_work(acptboot_data->acptbootwq,
+ &acptboot_data->acptbootwork);
+ return 0;
+
+error:
+
+ if (acptboot_data->listen_epd)
+ if ((ret = scif_close(acptboot_data->listen_epd)) < 0)
+ pr_debug("ACPTBOOT: scif_close() failed! %d\n", ret);
+
+ kfree(acptboot_data);
+
+ return err;
+}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* contains code to handle MIC IO control codes */
+
+#include "mic_common.h"
+
+static int do_send_flash_cmd(mic_ctx_t *mic_ctx, struct ctrlioctl_flashcmd *args);
+static int get_card_mem(mic_ctx_t *mic_ctx, struct ctrlioctl_cardmemcpy *args);
+
+/*
+ DESCRIPTION:: Gets the opcode from the input buffer and call appropriate method
+ PARAMETERS::
+ [in]mic_ctx_t *mic_ctx - pointer to the mic private context
+ [in]void *in_buffer - input buffer containing opcode + ioctl arguments,
+ RETURN_VALUE:: 0 if successful, non-zero if failure
+*/
+int
+adapter_do_ioctl(uint32_t cmd, uint64_t arg)
+{
+ int status = 0;
+ mic_ctx_t *mic_ctx = NULL;
+
+ void __user *argp = (void __user *)arg;
+ switch (cmd) {
+
+ case IOCTL_FLASHCMD:
+ {
+ struct ctrlioctl_flashcmd args = {0};
+
+ if (copy_from_user(&args, argp, sizeof(struct ctrlioctl_flashcmd))) {
+ return -EFAULT;
+ }
+
+ if (args.brdnum >= (uint32_t)mic_data.dd_numdevs) {
+ printk(KERN_ERR "IOCTL error: given board num is invalid\n");
+ return -EINVAL;
+ }
+
+ mic_ctx = get_per_dev_ctx(args.brdnum);
+ if (!mic_ctx) {
+ printk(KERN_ERR "IOCTL error: null mic context\n");
+ return -ENODEV;
+ }
+
+ /* Make sure we are running in flash mode */
+ if (mic_ctx->mode != MODE_FLASH || mic_ctx->state != MIC_ONLINE) {
+ printk(KERN_ERR "%s Card is not online in flash mode or online state\n", __func__);
+ return -EPERM;
+ }
+
+ if (mic_ctx->bi_family != FAMILY_KNC) {
+ printk(KERN_ERR "%s IOCTL_FLASHCMD not supported for non KNC family cards\n", __func__);
+ return -EPERM;
+ }
+
+ status = do_send_flash_cmd(mic_ctx, &args);
+ if (status) {
+ printk(KERN_ERR "IOCTL error: failed to complete IOCTL for bdnum %d\n", args.brdnum);
+ return status;
+ }
+
+ if (copy_to_user(argp, &args, sizeof(struct ctrlioctl_flashcmd))) {
+ return -EFAULT;
+ }
+
+ break;
+ }
+
+ case IOCTL_CARDMEMCPY:
+ {
+ struct ctrlioctl_cardmemcpy args = {0};
+
+ if (copy_from_user(&args, argp, sizeof(struct ctrlioctl_cardmemcpy))) {
+ return -EFAULT;
+ }
+
+ if (args.brdnum >= (uint32_t)mic_data.dd_numdevs) {
+ printk(KERN_ERR "IOCTL error: given board num is invalid\n");
+ return -EINVAL;
+ }
+ mic_ctx = get_per_dev_ctx(args.brdnum);
+ if (!mic_ctx) {
+ printk(KERN_ERR "IOCTL error: null mic context\n");
+ return -ENODEV;
+ }
+
+ if(mic_ctx->state != MIC_ONLINE || mic_ctx->mode != MODE_LINUX) {
+ status = -EPERM;
+ printk("Error ! Card not in linux mode or online state!\n");
+ return status;
+ }
+
+ status = get_card_mem(mic_ctx, &args);
+ if (status) {
+ printk(KERN_ERR "IOCTL error: failed to complete IOCTL for bdnum %d\n", args.brdnum);
+ return status;
+ }
+
+ ;
+ break;
+ }
+
+ default:
+ printk("Invalid IOCTL\n");
+ status = -EINVAL;
+ break;
+ }
+
+ return status;
+}
+
+int
+do_send_flash_cmd(mic_ctx_t *mic_ctx, struct ctrlioctl_flashcmd *args)
+{
+ int status = 0;
+
+ if(!capable(CAP_SYS_ADMIN)) {
+ printk(KERN_ERR "Cannot execute unless sysadmin\n");
+ return -EACCES;
+ }
+
+ pr_debug("%s\n IN:: brdnum = %d, type = %x, data = %p, len = %x\n",
+ __func__, args->brdnum, args->type, args->data, args->len);
+
+ status = send_flash_cmd(mic_ctx, args->type, args->data, args->len);
+
+ return status;
+}
+
+
+int
+get_card_mem(mic_ctx_t *mic_ctx, struct ctrlioctl_cardmemcpy *args)
+{
+ int32_t status = 0;
+
+ if(!capable(CAP_SYS_ADMIN)) {
+ printk(KERN_ERR "Cannot execute unless sysadmin\n");
+ return -EACCES;
+ }
+
+ if (args->dest == NULL) {
+ status = EINVAL;
+ goto exit;
+ }
+ pr_debug("%s\n IN:: brdnum = %d, start = %qx, size = %qx, dest = %p\n",
+ __func__, args->brdnum, args->start, args->size, args->dest);
+
+ status = get_cardside_mem(mic_ctx, args->start, args->size, args->dest);
+
+exit:
+ return status;
+
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+#include "micint.h"
+#include "mic/micveth.h"
+
+/*
+ * Retrieves the device context for a particular device
+ */
+mic_ctx_t *
+get_device_context(struct pci_dev *dev) {
+ int i = 0;
+ mic_ctx_t *mic_ctx = NULL;
+ for (i = (mic_data.dd_numdevs -1); i >= 0; i--) {
+ mic_ctx = &mic_data.dd_bi[i]->bi_ctx;
+ if (mic_ctx!= NULL) {
+ //TODO: Is bus number enough to uniquely identify a
+ //pci_dev struct in mic_ctx?
+ if (mic_ctx->bi_pdev->bus->number ==
+ dev->bus->number) {
+
+ //Bus number matches
+ break;
+ }
+ }
+ }
+ return mic_ctx;
+}
+
+/*
+ * Notifier callback with event specifying the actual power management
+ * event to have happened.Our events of Interest right now are:
+ * PM_HIBERNATION_PREPARE and PM_POST_RESTORE
+ */
+int
+micpm_notifier_block(struct notifier_block *nb, unsigned long event, void *dummy)
+{
+ int i;
+ mic_ctx_t *mic_ctx;
+ switch (event) {
+ case PM_POST_RESTORE:
+ case PM_POST_SUSPEND:
+ case PM_POST_HIBERNATION:
+ pr_debug("%s Calling MIC resume\n", __func__);
+ for(i = 0; i < mic_data.dd_numdevs; i++) {
+ mic_ctx = get_per_dev_ctx(i);
+ if (mic_ctx && mic_ctx->micpm_ctx.resume.wq) {
+ queue_work(mic_ctx->micpm_ctx.resume.wq,
+ &mic_ctx->micpm_ctx.resume.work);
+ }
+ }
+ break;
+ default:
+ pr_debug("%s: Unrecognized event %lu\n", __func__, event);
+ break;
+ }
+return 0;
+}
+
+/*
+ * Called by the OS when going into suspend.
+ * Puts our device to D3Cold.
+ */
+int
+micpm_suspend(struct device *pdev)
+{
+ struct pci_dev *pci_dev = to_pci_dev(pdev);
+ mic_ctx_t *mic_ctx = get_device_context(pci_dev);
+
+ if (!pci_dev) {
+ pr_debug("Not initialized, aborting suspend.\n");
+ return -ENODEV;
+ }
+
+ pr_debug("pm_stop_device called for dev: %d:%d:%d\n", pci_dev->bus->number,
+ PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn));
+ pm_stop_device(mic_ctx);
+ pci_save_state(pci_dev);
+ pci_disable_device(pci_dev);
+ if (pci_set_power_state(pci_dev, PCI_D3cold))
+ pr_debug("Not able to set to D3Cold state\n");
+ pr_debug("Returning from mic_suspend\n");
+ return 0;
+}
+
+/*
+ * Called by the OS when coming out of suspend.
+ * Puts our device to D0 and starts driver components.
+ */
+int
+micpm_resume(struct device *pdev)
+{
+ struct pci_dev *pci_dev = to_pci_dev(pdev);
+ if (!pci_dev) {
+ pr_debug("Device not initialized. aborting resume");
+ return -ENODEV;
+ }
+
+ pci_set_power_state(pci_dev, PCI_D0);
+ if (pci_enable_device(pci_dev)) {
+ pr_debug("Failed to wake-up device.\n");
+ return -EIO;
+ }
+ pci_restore_state(pci_dev);
+ pci_set_master(pci_dev);
+ pr_debug("pm_start_device called for dev: %d:%d:%d\n", pci_dev->bus->number,
+ PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn));
+ return 0;
+}
+
+int micpm_suspend_noirq(struct device *pdev) {
+
+ struct pci_dev *pci_dev = to_pci_dev(pdev);
+ mic_ctx_t *mic_ctx;
+ bd_info_t *bd_info;
+
+ if (!pci_dev) {
+ pr_debug("Device not initialized. aborting suspend");
+ return -ENODEV;
+ }
+
+ mic_ctx = get_device_context(pci_dev);
+ if(mic_ctx) {
+ bd_info = mic_ctx->bd_info;
+ /* MSI interrupts do not work on resume.
+ * See http://www.digipedia.pl/usenet/thread/18815/2513/
+ * for a discussion on this issue.
+ */
+ if (mic_ctx->msie) {
+ free_irq(bd_info->bi_msix_entries[0].vector, &bd_info->bi_ctx);
+ }
+ }
+ return 0;
+}
+
+int micpm_resume_noirq(struct device *pdev) {
+
+ struct pci_dev *pci_dev = to_pci_dev(pdev);
+ mic_ctx_t *mic_ctx;
+ bd_info_t *bd_info;
+ int err;
+
+ if (!pci_dev) {
+ pr_debug("Device not initialized. aborting resume");
+ return -ENODEV;
+ }
+ mic_ctx = get_device_context(pci_dev);
+ if(mic_ctx) {
+ bd_info = mic_ctx->bd_info;
+
+ /* MSI interrupts do not work on resume.
+ * See http://www.digipedia.pl/usenet/thread/18815/2513/
+ * for a discussion on this issue.
+ */
+ if (mic_ctx->msie) {
+ err = request_irq(bd_info->bi_msix_entries[0].vector,
+ mic_irq_isr, 0, "mic", mic_ctx);
+ if (err) {
+ pr_debug("%s: %d Error inititalizing MSI interrupts\n",
+ __func__, __LINE__);
+ return 0;
+ }
+ }
+
+ }
+ return 0;
+}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "micint.h"
+
+int mic_psmi_open(struct file *filp)
+{
+ bd_info_t *bd_info = mic_data.dd_bi[0];
+ if (!bd_info->bi_ctx.bi_psmi.enabled)
+ return -EINVAL;
+ ((filp)->private_data) = &bd_info->bi_ctx;
+ return 0;
+}
+
+extern int usagemode_param;
+
/*
 * mic_psmi_read - read() handler for the PSMI memory device.
 *
 * Serves either card aperture memory (when the card is not in normal
 * usage mode) or the host memory pages allocated for the PSMI handler,
 * copied page by page. Returns the number of bytes read, 0 at end of
 * region, or -EINVAL/-EFAULT on error.
 */
ssize_t mic_psmi_read(struct file * filp, char __user *buf,
		size_t count, loff_t *pos)
{
	ssize_t total_bytes = 0;
	unsigned int pg_no, pg_off, bytes;
	mic_ctx_t *mic_ctx = ((filp)->private_data);
	struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi;
	loff_t mem_size;

	if (!psmi_ctx->enabled)
		return -EINVAL;
	/* ABR family outside normal mode exposes the full aperture window. */
	if (FAMILY_ABR == mic_ctx->bi_family &&
		USAGE_MODE_NORMAL != usagemode_param)
		mem_size = MIC_APERTURE_SIZE;
	else
		mem_size = psmi_ctx->dma_mem_size;
	if (*pos >= mem_size || count <= 0)
		return 0;
	/* Clamp the request to the end of the region. */
	if (*pos + count > mem_size)
		count = mem_size - *pos;
	/* read aperture memory */
	if (USAGE_MODE_NORMAL != usagemode_param) {
		if (copy_to_user(buf,
			mic_ctx->aper.va + *pos, count))
			return -EFAULT;
		goto read_exit;
	}
	/* read host memory allocated for psmi handler */
	pg_no = *pos / MIC_PSMI_PAGE_SIZE;
	pg_off = *pos % MIC_PSMI_PAGE_SIZE;
	while (total_bytes < count) {
		/* Make the device-written page visible to the CPU first.
		 * NOTE(review): dma_tbl index is pg_no + 1 — presumably
		 * entry 0 is a header/level-0 entry; confirm. */
		pci_dma_sync_single_for_cpu(mic_ctx->bi_pdev,
			psmi_ctx->dma_tbl[pg_no + 1].pa,
				MIC_PSMI_PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
		bytes = MIC_PSMI_PAGE_SIZE - pg_off;
		if (total_bytes + bytes > count)
			bytes = count - total_bytes;
		/* NOTE(review): va_tbl[].pa is dereferenced as a CPU
		 * pointer — presumably it holds a kernel VA despite the
		 * field name; confirm against the allocation site. */
		if (copy_to_user(buf,
			(void *)psmi_ctx->va_tbl[pg_no].pa + pg_off, bytes))
			return -EFAULT;
		total_bytes += bytes;
		buf += bytes;
		pg_no++;
		/* Only the first page needs an offset */
		pg_off = 0;
	}
read_exit:
	*pos += count;
	return count;
}
+
+static ssize_t show_mem_size(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ mic_ctx_t *mic_ctx = dev_get_drvdata(dev);
+ struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi;
+
+ return snprintf(buf, PAGE_SIZE, "%ld\n",
+ (unsigned long)psmi_ctx->dma_mem_size);
+}
+static DEVICE_ATTR(mem_size, S_IRUGO, show_mem_size, NULL);
+
/* sysfs attributes exported for the PSMI interface (NULL-terminated). */
static struct attribute *psmi_attributes[] = {
	&dev_attr_mem_size.attr,
	NULL
};

/* Attribute group registered on the MIC device when PSMI is enabled. */
struct attribute_group psmi_attr_group = {
	.attrs = psmi_attributes
};
+
+#if (defined(RHEL_RELEASE_CODE) && \
+ (LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32))) || \
+ LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+static ssize_t mic_psmi_read_ptes(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf, loff_t pos, size_t size)
+#else
+static ssize_t mic_psmi_read_ptes(struct kobject *kobj,
+ struct bin_attribute *attr, char *buf, loff_t pos, size_t size)
+#endif
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct mic_psmi_ctx *psmi_ctx =
+ &((mic_ctx_t *)dev_get_drvdata(dev))->bi_psmi;
+
+ if (pos >= psmi_ctx->dma_tbl_size || size <= 0)
+ return 0;
+ if (pos + size > psmi_ctx->dma_tbl_size)
+ size = psmi_ctx->dma_tbl_size - pos;
+ memcpy(buf, psmi_ctx->dma_tbl, size);
+ return size;
+}
+
+struct bin_attribute mic_psmi_ptes_attr = {
+ .attr = {
+ .name = "psmi_ptes",
+ .mode = S_IRUSR
+ },
+ .read = mic_psmi_read_ptes
+};
+
/* Module parameter "psmi": enables the PSMI interface; readable by root only. */
extern bool mic_psmi_enable;
module_param_named(psmi, mic_psmi_enable, bool, S_IRUSR);
MODULE_PARM_DESC(psmi, "Enable/disable mic psmi");
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic_common.h"
+#include "mic/micscif_smpt.h"
+#include "mic/micscif_nodeqp.h"
+#include "mic/micscif_intr.h"
+#include "mic/micscif_nm.h"
+#include "micint.h"
+
+struct micscif_info ms_info;
+struct micscif_dev scif_dev[MAX_BOARD_SUPPORTED + 1];
+
/* Default watchdog / crash-dump policy knobs, consumed by micscif_init(). */
bool mic_watchdog_enable = true;
bool mic_watchdog_auto_reboot = true;
bool mic_crash_dump_enabled = true;
+
/*
 * micscif_init - Initialize the host-side SCIF core.
 *
 * Sets up the global ms_info state (locks, lists, workqueues), then
 * configures the host's own scif_dev entry and its loopback queue pair.
 * Returns 0 on success or -ENOMEM if a workqueue cannot be created.
 */
int
micscif_init(void)
{
	int err;
	ms_info.mi_nodeid = 0; // Host is node 0
	ms_info.mi_maxid = 0; // Host is at start the max card ID
	ms_info.mi_total = 1; // Host will know about this many MIC cards
	ms_info.mi_mask = 1; // first bit in the mask is the host node

	/* Locks protecting the endpoint/connection/RMA bookkeeping below. */
	mutex_init (&ms_info.mi_conflock);
	spin_lock_init(&ms_info.mi_eplock);
	spin_lock_init(&ms_info.mi_connlock);
	spin_lock_init(&ms_info.mi_rmalock);
	mutex_init (&ms_info.mi_fencelock);
	mutex_init (&ms_info.mi_event_cblock);
	spin_lock_init(&ms_info.mi_nb_connect_lock);
	INIT_LIST_HEAD(&ms_info.mi_uaccept);
	INIT_LIST_HEAD(&ms_info.mi_listen);
	INIT_LIST_HEAD(&ms_info.mi_zombie);
	INIT_LIST_HEAD(&ms_info.mi_connected);
	INIT_LIST_HEAD(&ms_info.mi_disconnected);
	INIT_LIST_HEAD(&ms_info.mi_rma);
	INIT_LIST_HEAD(&ms_info.mi_rma_tc);
#ifdef CONFIG_MMU_NOTIFIER
	INIT_LIST_HEAD(&ms_info.mi_mmu_notif_cleanup);
#endif
	INIT_LIST_HEAD(&ms_info.mi_fence);
	INIT_LIST_HEAD(&ms_info.mi_event_cb);
	INIT_LIST_HEAD(&ms_info.mi_nb_connect_list);
	ms_info.mi_watchdog_to = DEFAULT_WATCHDOG_TO;
#ifdef MIC_IS_EMULATION
	/* Emulation is too slow for a meaningful watchdog. */
	ms_info.mi_watchdog_enabled = 0;
	ms_info.mi_watchdog_auto_reboot = 0;
#else
	ms_info.mi_watchdog_enabled = mic_watchdog_enable;
	ms_info.mi_watchdog_auto_reboot = mic_watchdog_auto_reboot;
#endif
#ifdef RMA_DEBUG
	/* Debug counters tracking RMA allocations/pins. */
	ms_info.rma_unaligned_cpu_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
	ms_info.rma_alloc_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
	ms_info.rma_pin_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
#ifdef CONFIG_MMU_NOTIFIER
	ms_info.mmu_notif_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
#endif
#endif
	/* Service workqueues; ms_info is file-scope, so any not-yet-created
	 * workqueue pointer is NULL and the cleanup below skips it. */
	ms_info.mi_misc_wq = __mic_create_singlethread_workqueue("SCIF_MISC");
	if (!ms_info.mi_misc_wq) {
		err = -ENOMEM;
		goto wq_error;
	}
	INIT_WORK(&ms_info.mi_misc_work, micscif_misc_handler);
#ifdef CONFIG_MMU_NOTIFIER
	ms_info.mi_mmu_notif_wq = create_singlethread_workqueue("SCIF_MMU");
	if (!ms_info.mi_mmu_notif_wq) {
		err = -ENOMEM;
		goto wq_error;
	}
	INIT_WORK(&ms_info.mi_mmu_notif_work, micscif_mmu_notif_handler);
#endif
	ms_info.mi_conn_wq = __mic_create_singlethread_workqueue("SCIF_NB_CONN");
	if (!ms_info.mi_conn_wq) {
		err = -ENOMEM;
		goto wq_error;
	}
	INIT_WORK(&ms_info.mi_conn_work, micscif_conn_handler);

	//pr_debug("micscif_create(%d) \n", num_bds);

	// Setup information for self aka loopback.
	scif_dev[SCIF_HOST_NODE].sd_node = SCIF_HOST_NODE;
	micscif_setup_loopback_qp(&scif_dev[SCIF_HOST_NODE]);
	scif_dev[SCIF_HOST_NODE].sd_state = SCIFDEV_RUNNING;
	scif_dev[SCIF_HOST_NODE].scif_ref_cnt =
		(atomic_long_t) ATOMIC_LONG_INIT(0);
	scif_dev[SCIF_HOST_NODE].scif_map_ref_cnt = 0;
	init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_wq);
	init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_mmap_wq);
	mutex_init (&scif_dev[SCIF_HOST_NODE].sd_lock);
	ms_info.mi_rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT;
	ms_info.en_msg_log = 0;
	scif_proc_init();
	return 0;
wq_error:
	/* Destroy whichever workqueues were successfully created. */
	if (ms_info.mi_misc_wq)
		destroy_workqueue(ms_info.mi_misc_wq);
#ifdef CONFIG_MMU_NOTIFIER
	if (ms_info.mi_mmu_notif_wq)
		destroy_workqueue(ms_info.mi_mmu_notif_wq);
#endif
	if (ms_info.mi_conn_wq)
		destroy_workqueue(ms_info.mi_conn_wq);
	return err;
}
+
+void
+micscif_destroy(void)
+{
+ struct list_head *pos, *unused;
+ struct scif_callback *temp;
+#ifdef CONFIG_MMU_NOTIFIER
+ destroy_workqueue(ms_info.mi_mmu_notif_wq);
+#endif
+ destroy_workqueue(ms_info.mi_misc_wq);
+ destroy_workqueue(ms_info.mi_conn_wq);
+ micscif_destroy_loopback_qp(&scif_dev[SCIF_HOST_NODE]);
+ scif_proc_cleanup();
+ mic_debug_uninit();
+ list_for_each_safe(pos, unused, &ms_info.mi_event_cb) {
+ temp = list_entry(pos, struct scif_callback, list_member);
+ list_del(pos);
+ kfree(temp);
+ }
+ mutex_destroy(&ms_info.mi_event_cblock);
+}
+
+int
+micscif_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell)
+{
+ struct micscif_dev *dev = &scif_dev[mic_ctx->bi_id + 1];
+
+ queue_work(dev->sd_intr_wq, &dev->sd_intr_bh);
+ return 0;
+}
+
+int micscif_setup_host_qp(mic_ctx_t *mic_ctx, struct micscif_dev *scifdev);
+
+void
+micscif_probe(mic_ctx_t *mic_ctx)
+{
+ struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1];
+
+ // The host needs to keep track of scif_dev interfaces for all boards in
+ // the system. Host is node zero for MIC board 0 is SCIF node 1, etc.
+ // This will need to become more dynamic if hot plug is supported
+
+ scifdev->sd_node = mic_ctx->bi_id + 1;
+ scifdev->sd_state = SCIFDEV_STOPPED;
+ scifdev->mm_sbox = mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS;
+
+ /* This workqueue thread will handle all card->host interrupt processing. */
+ micscif_setup_interrupts(scifdev);
+
+ init_waitqueue_head(&scifdev->sd_mmap_wq);
+ init_waitqueue_head(&scifdev->sd_wq);
+ mutex_init (&scifdev->sd_lock);
+ INIT_LIST_HEAD(&scifdev->sd_p2p);
+
+ init_waitqueue_head(&scifdev->sd_watchdog_wq);
+ snprintf(scifdev->sd_ln_wqname, sizeof(scifdev->sd_intr_wqname),
+ "SCIF LOSTNODE %d", scifdev->sd_node);
+ if (!(scifdev->sd_ln_wq =
+ __mic_create_singlethread_workqueue(scifdev->sd_ln_wqname)))
+ printk(KERN_ERR "%s %d wq creation failed\n", __func__, __LINE__);
+ INIT_DELAYED_WORK(&scifdev->sd_watchdog_work, micscif_watchdog_handler);
+ /*
+ * Register function for doorbell 0 which will
+ * basically kick off the workqueue.
+ */
+ mic_reg_irqhandler(mic_ctx, 0, "SCIF DoorBell 0",
+ micscif_host_doorbell_intr_handler);
+}
+
+void
+micscif_start(mic_ctx_t *mic_ctx)
+{
+ struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1];
+
+ scifdev->scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+ scifdev->scif_map_ref_cnt = 0;
+
+ scifdev->sd_state = SCIFDEV_INIT;
+
+
+ /* Sets up bd_bs and the host side of the queuepair */
+ pr_debug("micscif_probe: host setting up qp \n");
+ micscif_setup_host_qp(mic_ctx, scifdev);
+}
+
+void micscif_removehost_respose(struct micscif_dev *scifdev, struct nodemsg *msg);
+
+void
+micscif_stop(mic_ctx_t *mic_ctx)
+{
+ struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1];
+
+ if (scifdev->sd_state == SCIFDEV_STOPPED || scifdev->sd_state == SCIFDEV_INIT)
+ return;
+
+ micscif_disconnect_node(scifdev->sd_node, NULL, DISCONN_TYPE_LOST_NODE);
+}
+
+void
+micscif_remove(mic_ctx_t *mic_ctx)
+{
+ struct micscif_dev *scifdev = &scif_dev[mic_ctx->bi_id + 1];
+ struct micscif_qp *qp = &scifdev->qpairs[0];
+
+ destroy_workqueue(scifdev->sd_intr_wq);
+ scifdev->sd_intr_wq = 0;
+ cancel_delayed_work_sync(&scifdev->sd_watchdog_work);
+ if (scifdev->sd_ln_wq){
+ destroy_workqueue(scifdev->sd_ln_wq);
+ scifdev->sd_ln_wq = 0;
+ }
+ mic_unreg_irqhandler(mic_ctx, 0x0, "SCIF DoorBell 0");
+
+ if (qp) {
+ mic_ctx_unmap_single(mic_ctx, qp->local_buf, qp->inbound_q.size);
+ mic_ctx_unmap_single(mic_ctx, qp->local_qp, sizeof(struct micscif_qp));
+ kfree((void*)(qp->inbound_q.rb_base));
+ }
+
+ if (scifdev->qpairs) {
+ kfree(scifdev->qpairs);
+ scifdev->qpairs = NULL;
+ }
+}
+
+int
+scif_get_node_status(int node_id)
+{
+ struct micscif_dev *scifdev = &scif_dev[node_id];
+
+ return scifdev->sd_state;
+}
+
+struct scatterlist *
+micscif_p2p_mapsg(void *va, int page_size, int page_cnt)
+{
+ struct scatterlist *sg;
+ struct page *page;
+ int i;
+
+ if ((sg = kcalloc(page_cnt, sizeof(struct scatterlist), GFP_KERNEL)) == NULL) {
+ return NULL;
+ }
+
+ sg_init_table(sg, page_cnt);
+
+ for (i = 0; i < page_cnt; i++) {
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0))
+ phys_addr_t phys;
+ phys = slow_virt_to_phys(va);
+
+ if ((page = pfn_to_page(phys >> PAGE_SHIFT)) == NULL)
+ goto p2p_sg_err;
+#else
+ if ((page = vmalloc_to_page(va)) == NULL)
+ goto p2p_sg_err;
+#endif
+ sg_set_page(&sg[i], page, page_size, 0);
+ va += page_size;
+ }
+
+ return sg;
+
+p2p_sg_err:
+ kfree(sg);
+ return NULL;
+}
+
/*
 * micscif_p2p_freesg() - release a scatterlist allocated by
 * micscif_p2p_mapsg(). Safe to call with NULL (kfree is a no-op then).
 */
void
micscif_p2p_freesg(struct scatterlist *sg)
{
	kfree(sg);
}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "micint.h"
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+#include "mic/micveth.h"
+
+#define SBOX_SCR9_VENDORID(x) ((x) & 0xf)
+#define SBOX_SCR9_REVISION(x) (((x) >> 4) & 0xf)
+#define SBOX_SCR9_DENSITY(x) (((x) >> 8) & 0x3)
+#define SBOX_SCR9_ECC(x) (((x) >> 29) & 0x1)
+
+bd_info_t *
+dev_to_bdi(struct device *dev)
+{
+ struct list_head *pos, *tmpq;
+ bd_info_t *bdi = NULL;
+ list_for_each_safe(pos, tmpq, &mic_data.dd_bdlist) {
+ bdi = list_entry(pos, bd_info_t, bi_list);
+ if (bdi->bi_sysfsdev == dev)
+ break;
+ }
+ return bdi;
+}
+
+/*
+ * sysfs entries in lieu of MMIO ioctl
+ */
+
/* A sysfs attribute backed by one SBOX MMIO register field: the register
 * offset plus the mask/shift used to extract the value shown to userspace. */
struct device_attribute_sbox {
	struct device_attribute devattr;
	uint32_t offset, mask, shift;
};

/*
 * Read an SBOX register while holding a PM reference on the board.
 * NOTE(review): the return type is uint32_t, so the -EAGAIN failure value
 * comes back as 0xffffffea and is indistinguishable from a register that
 * genuinely reads that value — confirm callers tolerate this.
 */
uint32_t
bd_sbox_read(bd_info_t *bdi, uint32_t offset)
{
	uint32_t reg_value, ret;
	ret = micpm_get_reference(&bdi->bi_ctx, true);
	if (ret)
		return -EAGAIN;
	reg_value = SBOX_READ(bdi->bi_ctx.mmio.va, offset);
	ret = micpm_put_reference(&bdi->bi_ctx);
	if (ret)
		return -EAGAIN;

	return reg_value;
}

/* Declare a sysfs attribute exposing one masked/shifted SBOX register
 * field through show_sbox_register(). */
#define DEVICE_ATTR_SBOX(_name, _mode, _offset, _mask, _shift) \
struct device_attribute_sbox sbox_attr_##_name = \
{ __ATTR(_name, _mode, show_sbox_register, NULL), _offset, _mask, _shift }
+
+ssize_t
+show_sbox_register(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct device_attribute_sbox *attr_sbox = container_of(attr,
+ struct device_attribute_sbox, devattr);
+ bd_info_t *bdi = dev_to_bdi(dev);
+ return snprintf(buf, PAGE_SIZE, "%x\n",
+ (bd_sbox_read(bdi, attr_sbox->offset) >> attr_sbox->shift) & attr_sbox->mask);
+}
+
+#ifdef CONFIG_ML1OM
+static DEVICE_ATTR_SBOX(corevoltage, S_IRUGO, SBOX_COREVOLT, MASK_COREVOLT, SHIFT_COREVOLT);
+static DEVICE_ATTR_SBOX(corefrequency, S_IRUGO, SBOX_COREFREQ, MASK_COREFREQ, SHIFT_COREFREQ);
+#endif
+static DEVICE_ATTR_SBOX(memoryvoltage, S_IRUGO, SBOX_MEMVOLT, MASK_MEMVOLT, SHIFT_MEMVOLT);
+static DEVICE_ATTR_SBOX(memoryfrequency, S_IRUGO, SBOX_MEMORYFREQ, MASK_MEMORYFREQ, SHIFT_MEMORYFREQ);
+static DEVICE_ATTR_SBOX(memsize, S_IRUGO, SBOX_SCRATCH0, MASK_MEMSIZE, SHIFT_MEMSIZE);
+static DEVICE_ATTR_SBOX(flashversion, S_IRUGO, SBOX_SCRATCH7, MASK_FLASHVERSION, SHIFT_FLASHVERSION);
+
+/* HW Info */
+static DEVICE_ATTR_SBOX(substepping_data, S_IRUGO, SBOX_SCRATCH13, MASK_SUBSTEPPING_DATA, SHIFT_SUBSTEPPING_DATA);
+static DEVICE_ATTR_SBOX(stepping_data, S_IRUGO, SBOX_SCRATCH13, MASK_STEPPING_DATA, SHIFT_STEPPING_DATA);
+static DEVICE_ATTR_SBOX(model, S_IRUGO, SBOX_SCRATCH13, MASK_MODEL, SHIFT_MODEL);
+static DEVICE_ATTR_SBOX(family_data, S_IRUGO, SBOX_SCRATCH13, MASK_FAMILY_DATA, SHIFT_FAMILY_DATA);
+static DEVICE_ATTR_SBOX(processor, S_IRUGO, SBOX_SCRATCH13, MASK_PROCESSOR, SHIFT_PROCESSOR);
+static DEVICE_ATTR_SBOX(platform, S_IRUGO, SBOX_SCRATCH13, MASK_PLATFORM, SHIFT_PLATFORM);
+static DEVICE_ATTR_SBOX(extended_model, S_IRUGO, SBOX_SCRATCH13, MASK_EXTENDED_MODEL, SHIFT_EXTENDED_MODEL);
+static DEVICE_ATTR_SBOX(extended_family, S_IRUGO, SBOX_SCRATCH13, MASK_EXTENDED_FAMILY, SHIFT_EXTENDED_FAMILY);
+/* copy of fuse_configuration_revision [129:120] */
+static DEVICE_ATTR_SBOX(fuse_config_rev, S_IRUGO, SBOX_SCRATCH7, MASK_FUSE_CONFIG_REV, SHIFT_FUSE_CONFIG_REV);
+
+static DEVICE_ATTR_SBOX(active_cores, S_IRUGO, SBOX_SCRATCH4, MASK_ACTIVE_CORES, SHIFT_ACTIVE_CORES);
+static DEVICE_ATTR_SBOX(fail_safe_offset, S_IRUSR, SBOX_FAIL_SAFE_OFFSET, MASK_FAIL_SAFE, SHIFT_FAIL_SAFE);
+
+ssize_t show_flash_update(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ uint32_t value, ret;
+ bd_info_t *bdi = dev_to_bdi(dev);
+ ret = micpm_get_reference(&bdi->bi_ctx, true);
+ if (ret)
+ return -EAGAIN;
+ value = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF0X0);
+ ret = micpm_put_reference(&bdi->bi_ctx);
+ if (ret)
+ return -EAGAIN;
+
+ return snprintf(buf, PAGE_SIZE, "%x\n", value);
+}
+
+static ssize_t
+set_flash_update(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ unsigned long value;
+ int ret;
+ bd_info_t *bdi = dev_to_bdi(dev);
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,39)
+ ret = kstrtoul(buf, 0, &value);
+ if (ret)
+ return count;
+#else
+ value = simple_strtoul(buf, NULL, 10);
+#endif
+ ret = micpm_get_reference(&bdi->bi_ctx, true);
+ if (ret)
+ return -EAGAIN;
+ DBOX_WRITE((unsigned int)value, bdi->bi_ctx.mmio.va, DBOX_SWF0X0);
+ ret = micpm_put_reference(&bdi->bi_ctx);
+ if (ret)
+ return -EAGAIN;
+
+ return count;
+
+}
+static DEVICE_ATTR(flash_update, S_IRUSR | S_IWUSR, show_flash_update, set_flash_update);
+
+ssize_t
+show_meminfo(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ uint32_t value;
+ bd_info_t *bdi = dev_to_bdi(dev);
+ value = bd_sbox_read(bdi, SBOX_SCRATCH9);
+ return snprintf(buf, PAGE_SIZE, "vendor:%x,revision:%x"
+ ",density:%x,ecc_enable:%x",
+ SBOX_SCR9_VENDORID(value), SBOX_SCR9_REVISION(value),
+ SBOX_SCR9_DENSITY(value), SBOX_SCR9_ECC(value));
+}
+static DEVICE_ATTR(meminfo, S_IRUGO, show_meminfo, NULL);
+
+ssize_t
+show_sku(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ return snprintf(buf, PAGE_SIZE, "%s\n", bdi->bi_ctx.sku_name);
+}
+static DEVICE_ATTR(sku, S_IRUGO, show_sku, NULL);
+/******************************************************************************/
+
+static ssize_t
+show_version(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", BUILD_VERSION);
+}
+static DEVICE_ATTR(version, S_IRUGO, show_version, NULL);
+
+static ssize_t
+show_p2p(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", mic_p2p_enable? "enable" : "disable");
+}
+static DEVICE_ATTR(peer2peer, S_IRUGO, show_p2p, NULL);
+
+static struct attribute *host_attributes[] = {
+ &dev_attr_version.attr,
+ &dev_attr_peer2peer.attr,
+ NULL
+};
+
+struct attribute_group host_attr_group = {
+ .attrs = host_attributes
+};
+
+static ssize_t
+show_family(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ static const char KNF[] = "Knights Ferry";
+ static const char KNC[] = "x100";
+ bd_info_t *bdi = dev_to_bdi(dev);
+ const char *card = NULL;
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ if (mic_ctx->bi_family == FAMILY_ABR)
+ card = KNF;
+ else
+ card = KNC;
+
+ if (card)
+ return snprintf(buf, PAGE_SIZE, "%s\n", card);
+ else
+ return snprintf(buf, PAGE_SIZE, "Unknown\n");
+}
+static DEVICE_ATTR(family, S_IRUGO, show_family, NULL);
+
/* sysfs show: silicon stepping string as produced by show_stepping_comm().
 * NOTE(review): string[] is only 3 bytes — assumes show_stepping_comm()
 * writes at most two characters plus NUL; confirm against its definition. */
static ssize_t
show_stepping(struct device *dev, struct device_attribute *attr, char *buf)
{
	bd_info_t *bdi = dev_to_bdi(dev);
	char string[3];
	show_stepping_comm(&bdi->bi_ctx,string);
	return snprintf(buf, PAGE_SIZE, "%s\n", string);
}
static DEVICE_ATTR(stepping, S_IRUGO, show_stepping, NULL);
+
+char *micstates[] = {"ready", "booting", "no response", "boot failed",
+ "online", "shutdown", "lost", "resetting", "reset failed", "invalid"};
+static ssize_t
+show_micstate(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+
+ if (bdi->bi_ctx.state >= MIC_INVALID)
+ mic_setstate(&bdi->bi_ctx, MIC_INVALID);
+ return snprintf(buf, PAGE_SIZE, "%s", micstates[bdi->bi_ctx.state]);
+}
+
/*
 * If *buf begins with the literal prefix `string`, advance *buf past the
 * prefix and return true; otherwise leave *buf untouched and return false.
 */
static int
match_micstate(const char **buf, const char *string)
{
	size_t prefix_len = strlen(string);

	if (strncmp(*buf, string, prefix_len) != 0)
		return false;
	*buf += prefix_len;
	return true;
}
+
+static ssize_t
+set_micstate(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ const char *default_mm_image = "/usr/share/mpss/boot/rasmm-kernel.from-eeprom.elf";
+
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+ int mode;
+ size_t len;
+ char *arg, *arg2 = NULL;
+ int err = 0;
+
+ /* parse the new state */
+ if (match_micstate(&buf, "boot:linux:")) {
+ mode = MODE_LINUX;
+ } else if (match_micstate(&buf, "boot:elf:")) {
+ mode = MODE_ELF;
+ } else if (match_micstate(&buf, "boot:flash:")) {
+ mode = MODE_FLASH;
+ } else if (sysfs_streq(buf, "reset")) {
+
+ mutex_lock(&mic_ctx->state_lock);
+ if (mic_ctx->state == MIC_READY) {
+ mutex_unlock(&mic_ctx->state_lock);
+ return -EINVAL;
+ }
+
+ mutex_unlock(&mic_ctx->state_lock);
+ adapter_stop_device(mic_ctx, 1, 0);
+ return count;
+ } else if (sysfs_streq(buf, "reset:force")) {
+ int reattempt = !RESET_REATTEMPT;
+
+ mutex_lock(&mic_ctx->state_lock);
+ if (mic_ctx->state == MIC_READY)
+ reattempt = RESET_REATTEMPT;
+
+ mutex_unlock(&mic_ctx->state_lock);
+ adapter_stop_device(mic_ctx, 1, reattempt);
+ return count;
+ } else if (sysfs_streq(buf, "shutdown")) {
+ adapter_shutdown_device(mic_ctx);
+ return count;
+ } else {
+ return -EINVAL;
+ }
+
+ /* we're booting something; a filename follows the colon */
+ len = strlen(buf);
+ if (buf && buf[0] == '\n') {
+ len = 0;
+ }
+ if (!len && mode == MODE_FLASH) {
+ buf = default_mm_image;
+ len = strlen(buf);
+ }
+ if (!(arg = kmalloc(len + 1, GFP_KERNEL)))
+ return -ENOMEM;
+ memcpy(arg, buf, len + 1);
+ if (arg[len - 1] == '\n')
+ arg[len - 1] = '\0';
+
+ /* if booting linux, there may be yet another filename */
+ if (mode == MODE_LINUX && (arg2 = strchr(arg, ':')))
+ *arg2++ = '\0';
+
+ /* atomically change the state */
+ mutex_lock(&mic_ctx->state_lock);
+ if (mic_ctx->state == MIC_READY) {
+ kfree(mic_ctx->image);
+ mic_ctx->mode = mode;
+ mic_ctx->image = arg;
+ mic_ctx->initramfs = arg2;
+ mic_setstate(mic_ctx, MIC_BOOT);
+ mutex_unlock(&mic_ctx->state_lock);
+ printk("mic image: %s\n", mic_ctx->image);
+ } else {
+ kfree(arg);
+ printk(KERN_ERR "Error! Card not in offline/ready state. Cannot change mode\n");
+ mutex_unlock(&mic_ctx->state_lock);
+ return -EIO;
+ }
+
+ /* actually perform the boot */
+ if (mode == MODE_LINUX) {
+ mic_ctx->card_usage_mode = USAGE_MODE_NORMAL;
+ err = boot_linux_uos(mic_ctx, mic_ctx->image, mic_ctx->initramfs);
+ if (!err)
+ adapter_post_boot_device(mic_ctx);
+ } else {
+ err = boot_micdev_app(mic_ctx, mic_ctx->image);
+ }
+
+ if (!err)
+ return count;
+ printk("booting failed %d\n", err);
+ return err;
+}
+static DEVICE_ATTR(state, S_IRUGO|S_IWUSR, show_micstate, set_micstate);
+
+char *micmodes[] = {"N/A", "linux", "elf", "flash"};
+
+static ssize_t
+show_mode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+
+ if (bdi->bi_ctx.mode > MODE_FLASH)
+ bdi->bi_ctx.mode = MODE_NONE;
+ return snprintf(buf, PAGE_SIZE, "%s", micmodes[bdi->bi_ctx.mode]);
+}
+static DEVICE_ATTR(mode, S_IRUGO, show_mode, NULL);
+
+int scif_get_node_status(int node_id);
+static char *scif_status_stings[] = {"not present", "initializing", "online",
+ "sleeping", "stopping", "stopped"};
+static ssize_t
+show_scif_status(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ int scif_status;
+
+ scif_status = scif_get_node_status(bdi->bi_ctx.bi_id + 1);
+ return snprintf(buf, PAGE_SIZE, "%s\n", scif_status_stings[scif_status]);
+}
+static DEVICE_ATTR(scif_status, S_IRUGO, show_scif_status, NULL);
+
+static ssize_t
+show_image(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ return snprintf(buf, PAGE_SIZE, "%s", bdi->bi_ctx.image);
+}
+static DEVICE_ATTR(image, S_IRUGO, show_image, NULL);
+
+static ssize_t
+show_initramfs(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ return snprintf(buf, PAGE_SIZE, "%s", bdi->bi_ctx.initramfs);
+}
+static DEVICE_ATTR(initramfs, S_IRUGO, show_initramfs, NULL);
+
+static ssize_t
+show_postcode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+ uint32_t postcode;
+
+ if ((micpm_get_reference(mic_ctx, true))) {
+ PM_DEBUG("get_reference failed. Node may be lost\n");
+ return -EBUSY;
+ }
+ postcode = mic_getpostcode(mic_ctx);
+ if (postcode == 0xffffffff) {
+ printk("Invalid Postcode : %c%c\n", postcode & 0xff, (postcode >> 8) & 0xff);
+ micpm_put_reference(mic_ctx);
+ return -ENXIO;
+ }
+
+ if (postcode == 0x0) {
+ printk("Postcode : %c%c\n", postcode & 0xff, (postcode >> 8) & 0xff);
+ micpm_put_reference(mic_ctx);
+ return -EAGAIN;
+ }
+ micpm_put_reference(mic_ctx);
+ return snprintf(buf, PAGE_SIZE, "%c%c", postcode & 0xff, (postcode >> 8) & 0xff);
+}
+static DEVICE_ATTR(post_code, S_IRUGO, show_postcode, NULL);
+
+static ssize_t
+show_boot_count(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ return snprintf(buf, PAGE_SIZE, "%d", mic_ctx->boot_count);
+}
+static DEVICE_ATTR(boot_count, S_IRUGO, show_boot_count, NULL);
+
+static ssize_t
+show_crash_count(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ return snprintf(buf, PAGE_SIZE, "%d", mic_ctx->crash_count);
+}
+static DEVICE_ATTR(crash_count, S_IRUGO, show_crash_count, NULL);
+
+static ssize_t
+show_cmdline(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+ char *cmdline = mic_ctx->sysfs_info.cmdline;
+
+ if (cmdline == NULL) {
+ return snprintf(buf, PAGE_SIZE, "not set\n");
+ } else {
+ return snprintf(buf, PAGE_SIZE, "%s\n", cmdline);
+ }
+ return 0;
+}
+
+static ssize_t
+set_cmdline(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ if (mic_ctx->sysfs_info.cmdline != NULL)
+ kfree(mic_ctx->sysfs_info.cmdline);
+
+ if ((mic_ctx->sysfs_info.cmdline = kmalloc(count + 100, GFP_ATOMIC)) == NULL)
+ return -ENOMEM;
+ strcpy(mic_ctx->sysfs_info.cmdline, buf);
+
+ if (mic_ctx->sysfs_info.cmdline[count - 1] == '\n')
+ mic_ctx->sysfs_info.cmdline[count - 1] = '\0';
+
+ return count;
+}
+static DEVICE_ATTR(cmdline, S_IRUGO|S_IWUSR, show_cmdline, set_cmdline);
+
+static ssize_t
+show_kernel_cmdline(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+ char *cmdline = mic_ctx->sysfs_info.kernel_cmdline;
+
+ if ((mic_ctx->state == MIC_READY) || (cmdline == NULL)) {
+ return snprintf(buf, PAGE_SIZE, "ready\n");
+ } else {
+ return snprintf(buf, PAGE_SIZE, "%s\n", cmdline);
+ }
+}
+static DEVICE_ATTR(kernel_cmdline, S_IRUGO, show_kernel_cmdline, NULL);
+
+static ssize_t show_pc3_enabled(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+ return snprintf(buf, PAGE_SIZE, "%d\n", mic_ctx->micpm_ctx.pc3_enabled);
+}
+static ssize_t
+store_pc3_enabled(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ int i, ret;
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ if(sscanf(buf, "%d", &i) != 1) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ if (i < 0) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ ret = micpm_update_pc3(mic_ctx, (i) ? true : false);
+ if (ret)
+ goto exit;
+
+ pr_debug("pc3_enabled = %d\n", mic_ctx->micpm_ctx.pc3_enabled);
+ ret = count;
+exit:
+ return ret;
+}
+static DEVICE_ATTR(pc3_enabled, S_IRUGO | S_IWUSR, show_pc3_enabled, store_pc3_enabled);
+
+static ssize_t show_pc6_enabled(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+ return snprintf(buf, PAGE_SIZE, "%d\n", mic_ctx->micpm_ctx.pc6_enabled);
+}
+
+static ssize_t
+store_pc6_enabled(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ int i, ret;
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ if(sscanf(buf, "%d", &i) != 1) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ if (i < 0) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ ret = micpm_update_pc6(mic_ctx, (i) ? true : false);
+ if (ret)
+ goto exit;
+
+ pr_debug("pc6_enabled = %d\n", mic_ctx->micpm_ctx.pc6_enabled);
+ ret = count;
+exit:
+ return ret;
+}
+
+static DEVICE_ATTR(pc6_enabled, S_IRUGO | S_IWUSR, show_pc6_enabled, store_pc6_enabled);
+
+static ssize_t show_pc6_timeout(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+ return snprintf(buf, PAGE_SIZE, "%u\n", mic_ctx->micpm_ctx.pc6_timeout);
+}
+static ssize_t
+store_pc6_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ int i, ret;
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ if(sscanf(buf, "%d", &i) != 1) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ if (i < 0) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ if (mic_ctx->micpm_ctx.pc6_timeout != i) {
+ mic_ctx->micpm_ctx.pc6_timeout = i;
+ }
+ pr_debug("pc6 timeout set to %us\n", mic_ctx->micpm_ctx.pc6_timeout);
+ ret = count;
+exit:
+ return ret;
+}
+static DEVICE_ATTR(pc6_timeout, S_IRUGO | S_IWUSR, show_pc6_timeout, store_pc6_timeout);
+
+static ssize_t show_log_buf_addr(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ return snprintf(buf, PAGE_SIZE, "%p\n", mic_ctx->log_buf_addr);
+}
+
+static ssize_t
+store_log_buf_addr(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ int ret;
+ uint64_t addr;
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ if (sscanf(buf, "%llx", &addr) != 1) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ mic_ctx->log_buf_addr = (void*)addr;
+ ret = count;
+exit:
+ return ret;
+}
+static DEVICE_ATTR(log_buf_addr, S_IRUGO | S_IWUSR, show_log_buf_addr, store_log_buf_addr);
+
+static ssize_t show_log_buf_len(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ return snprintf(buf, PAGE_SIZE, "%p\n", mic_ctx->log_buf_len);
+}
+
+static ssize_t
+store_log_buf_len(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+ int ret;
+ uint64_t addr;
+ bd_info_t *bdi = dev_to_bdi(dev);
+ mic_ctx_t *mic_ctx = &bdi->bi_ctx;
+
+ if (sscanf(buf, "%llx", &addr) != 1) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ mic_ctx->log_buf_len = (int*)addr;
+ ret = count;
+exit:
+ return ret;
+}
+static DEVICE_ATTR(log_buf_len, S_IRUGO | S_IWUSR, show_log_buf_len, store_log_buf_len);
+
/* Twelve serial-number bytes read as three 32-bit DBOX scratch words,
 * overlaid with a string view (13th byte reserved for the NUL). */
union serialnum {
	uint32_t values[3];
	char serial[13];
};
+
+static ssize_t
+show_serialnumber(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bd_info_t *bdi = dev_to_bdi(dev);
+ union serialnum serial;
+ uint32_t ret;
+
+ memset(serial.serial, 0, sizeof(serial.serial));
+ ret = micpm_get_reference(&bdi->bi_ctx, true);
+ if (ret)
+ return -EAGAIN;
+ serial.values[0] = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF1X0);
+ serial.values[1] = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF1X1);
+ serial.values[2] = DBOX_READ(bdi->bi_ctx.mmio.va, DBOX_SWF1X2);
+ ret = micpm_put_reference(&bdi->bi_ctx);
+ if (ret)
+ return -EAGAIN;
+ return snprintf(buf, PAGE_SIZE, "%s", serial.serial);
+}
+static DEVICE_ATTR(serialnumber, S_IRUGO, show_serialnumber, NULL);
+
+static ssize_t
+show_interface_version(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s", LINUX_INTERFACE_VERSION);
+}
+static DEVICE_ATTR(interface_version, S_IRUGO, show_interface_version, NULL);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \
+ defined(RHEL_RELEASE_CODE)
+extern ssize_t show_virtblk_file(struct device *dev, struct device_attribute *attr, char *buf);
+extern ssize_t store_virtblk_file(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count);
+static DEVICE_ATTR(virtblk_file, S_IRUGO | S_IWUSR, show_virtblk_file, store_virtblk_file);
+#endif
+
+static struct attribute *bd_attributes[] = {
+ &dev_attr_family.attr,
+ &dev_attr_stepping.attr,
+ &dev_attr_state.attr,
+ &dev_attr_mode.attr,
+ &dev_attr_image.attr,
+ &dev_attr_initramfs.attr,
+ &dev_attr_post_code.attr,
+ &dev_attr_boot_count.attr,
+ &dev_attr_crash_count.attr,
+ &dev_attr_cmdline.attr,
+ &dev_attr_kernel_cmdline.attr,
+ &dev_attr_serialnumber.attr,
+ &dev_attr_scif_status.attr,
+ &dev_attr_meminfo.attr,
+ &dev_attr_pc3_enabled.attr,
+ &dev_attr_pc6_enabled.attr,
+ &dev_attr_pc6_timeout.attr,
+ &dev_attr_flash_update.attr,
+ &dev_attr_log_buf_addr.attr,
+ &dev_attr_log_buf_len.attr,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \
+ defined(RHEL_RELEASE_CODE)
+ &dev_attr_virtblk_file.attr,
+#endif
+ &dev_attr_sku.attr,
+ &dev_attr_interface_version.attr,
+
+#ifdef CONFIG_ML1OM
+ &sbox_attr_corevoltage.devattr.attr,
+ &sbox_attr_corefrequency.devattr.attr,
+#endif
+ &sbox_attr_memoryvoltage.devattr.attr,
+ &sbox_attr_memoryfrequency.devattr.attr,
+ &sbox_attr_memsize.devattr.attr,
+ &sbox_attr_flashversion.devattr.attr,
+ &sbox_attr_substepping_data.devattr.attr,
+ &sbox_attr_stepping_data.devattr.attr,
+ &sbox_attr_model.devattr.attr,
+ &sbox_attr_family_data.devattr.attr,
+ &sbox_attr_processor.devattr.attr,
+ &sbox_attr_platform.devattr.attr,
+ &sbox_attr_extended_model.devattr.attr,
+ &sbox_attr_extended_family.devattr.attr,
+ &sbox_attr_fuse_config_rev.devattr.attr,
+ &sbox_attr_active_cores.devattr.attr,
+ &sbox_attr_fail_safe_offset.devattr.attr,
+ NULL
+};
+
+struct attribute_group bd_attr_group = {
+ .attrs = bd_attributes
+};
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/string.h>
+
+#include "mic/micscif_kmem_cache.h"
+#include "micint.h"
+#include "mic_common.h"
+#include "mic/io_interface.h"
+#include "mic/mic_pm.h"
+#include "mic/micveth.h"
+
+MODULE_LICENSE("GPL");
+MODULE_INFO(build_number, BUILD_NUMBER);
+MODULE_INFO(build_bywhom, BUILD_BYWHOM);
+MODULE_INFO(build_ondate, BUILD_ONDATE);
+MODULE_INFO(build_scmver, BUILD_SCMVER);
+
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34))
+#include <linux/pm_qos_params.h>
+#endif
+
+/* Slab cache used by SCIF for unaligned DMA copies. */
+struct kmem_cache *unaligned_cache;
+/* Linux-specific global driver state (char dev, class, pci driver). */
+mic_lindata_t mic_lindata;
+
+module_param_named(ulimit, mic_ulimit_check, bool, 0600);
+MODULE_PARM_DESC(ulimit, "SCIF ulimit check");
+
+module_param_named(reg_cache, mic_reg_cache_enable, bool, 0600);
+MODULE_PARM_DESC(reg_cache, "SCIF registration caching");
+
+module_param_named(huge_page, mic_huge_page_enable, bool, 0600);
+MODULE_PARM_DESC(huge_page, "SCIF Huge Page Support");
+
+extern bool mic_p2p_enable;
+module_param_named(p2p, mic_p2p_enable, bool, 0600);
+MODULE_PARM_DESC(p2p, "SCIF peer-to-peer");
+
+extern bool mic_p2p_proxy_enable;
+module_param_named(p2p_proxy, mic_p2p_proxy_enable, bool, 0600);
+MODULE_PARM_DESC(p2p_proxy, "SCIF peer-to-peer proxy DMA support");
+
+extern bool mic_watchdog_enable;
+module_param_named(watchdog, mic_watchdog_enable, bool, 0600);
+MODULE_PARM_DESC(watchdog, "SCIF Watchdog");
+
+extern bool mic_watchdog_auto_reboot;
+module_param_named(watchdog_auto_reboot, mic_watchdog_auto_reboot, bool, 0600);
+MODULE_PARM_DESC(watchdog_auto_reboot, "SCIF Watchdog auto reboot");
+
+bool mic_msi_enable = 1;
+module_param_named(msi, mic_msi_enable, bool, 0600);
+/* Fix: MODULE_PARM_DESC must name the exposed parameter ("msi"), not the
+ * backing variable, or the description attaches to a nonexistent param. */
+MODULE_PARM_DESC(msi, "To enable MSIx in the driver.");
+
+int mic_pm_qos_cpu_dma_lat = -1;
+module_param_named(pm_qos_cpu_dma_lat, mic_pm_qos_cpu_dma_lat, int, 0600);
+/* Fix: same parameter-name mismatch as "msi" above. */
+MODULE_PARM_DESC(pm_qos_cpu_dma_lat, "PM QoS CPU DMA latency in usecs.");
+
+extern int ramoops_count;
+module_param_named(ramoops_count, ramoops_count, int, 0600);
+MODULE_PARM_DESC(ramoops_count, "Maximum frame count for the ramoops driver.");
+
+extern bool mic_crash_dump_enabled;
+module_param_named(crash_dump, mic_crash_dump_enabled, bool, 0600);
+/* Fix: same parameter-name mismatch as "msi" above. */
+MODULE_PARM_DESC(crash_dump, "MIC Crash Dump enabled.");
+
+#define GET_FILE_SIZE_FROM_INODE(fp) i_size_read((fp)->f_path.dentry->d_inode)
+
+int usagemode_param = 0;
+
+/* open() handler shared by all minors: 0 = host control device (no
+ * per-open state), 1 = SCIF device, 2 = PSMI device. */
+static int
+mic_open(struct inode *inode, struct file *filp)
+{
+	unsigned int minor = MINOR(inode->i_rdev);
+
+	if (minor == 0)
+		return 0;
+	if (minor == 1)
+		return scif_fdopen(filp);
+	if (minor == 2)
+		return mic_psmi_open(filp);
+
+	return -EINVAL;
+}
+
+/* release() handler shared by all minors.  For the control device it
+ * tears down fasync notification if mic_fasync() armed it (signalled by
+ * private_data == filp). */
+static int
+mic_release(struct inode *inode, struct file *filp)
+{
+	unsigned int minor = MINOR(inode->i_rdev);
+	int rc = 0;
+
+	if (minor == 0) {
+		if (filp->private_data == filp) {
+			// Fasync is set
+			rc = fasync_helper(-1, filp, 0, &mic_data.dd_fasync);
+			mic_data.dd_fasync = NULL;
+		}
+		return rc;
+	}
+	if (minor == 1)
+		return scif_fdclose(filp);
+	if (minor == 2) {
+		// psmi access to device
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+extern ssize_t mic_psmi_read(struct file * filp, char __user *buf,
+ size_t count, loff_t *pos);
+/* read() is only supported on the PSMI device (minor 2). */
+static ssize_t
+mic_read(struct file * filp, char __user *buf,
+	 size_t count, loff_t *pos)
+{
+	if (MINOR(filp->f_path.dentry->d_inode->i_rdev) == 2)
+		return mic_psmi_read(filp, buf, count, pos);
+
+	return -EINVAL;
+}
+
+/* ioctl dispatch: SCIF (minor 1) has its own handler, the PSMI device
+ * (minor 2) accepts no ioctls, everything else goes to the adapter. */
+static long
+mic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	unsigned int minor = MINOR(filp->f_path.dentry->d_inode->i_rdev);
+
+	if (minor == 1)
+		return scif_process_ioctl(filp, cmd, arg);
+	if (minor == 2)
+		return -EINVAL;
+
+	return adapter_do_ioctl(cmd, arg);
+}
+
+/*
+ * fasync handler for the control device.  Registers/unregisters the
+ * caller for SIGIO delivery via mic_data.dd_fasync; private_data is
+ * used as a flag so mic_release() knows fasync was armed.
+ */
+static int
+mic_fasync(int fd, struct file *filp, int on)
+{
+ int rc;
+
+ if ((rc = fasync_helper(fd, filp, on, &mic_data.dd_fasync)) < 0) {
+ return rc;
+ }
+
+ if (on) {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0))
+ /* NOTE(review): assumes __f_setown() returned an int before 3.18
+ * and void afterwards -- verify against the target kernels. */
+ rc = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+#else
+ __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+#endif
+ filp->private_data = filp;
+ } else {
+ filp->private_data = NULL;
+ }
+
+ return rc;
+}
+
+/* mmap is only supported on the SCIF device (minor 1). */
+int
+mic_mmap(struct file *f, struct vm_area_struct *vma)
+{
+	if (MINOR(f->f_path.dentry->d_inode->i_rdev) == 1)
+		return micscif_mmap(f, vma);
+
+	return -EINVAL;
+}
+
+/* poll is only supported on the SCIF device (minor 1). */
+unsigned int
+mic_poll(struct file *f, poll_table *wait)
+{
+	dev_t dev = f->f_path.dentry->d_inode->i_rdev;
+
+	if (MINOR(dev) == 1)
+		return micscif_poll(f, wait);
+
+	/*
+	 * Fix: ->poll() must return an event mask, not an errno.
+	 * Returning -EINVAL here was interpreted as a mask with nearly
+	 * all event bits set; report POLLERR instead.
+	 */
+	return POLLERR;
+}
+
+/* flush is only meaningful on the SCIF device (minor 1). */
+int
+mic_flush(struct file *f, fl_owner_t id)
+{
+	if (MINOR(f->f_path.dentry->d_inode->i_rdev) == 1)
+		return micscif_flush(f, id);
+
+	return -EINVAL;
+}
+
+/*
+ * Interrupt handler.  MSI(-X) interrupts are always ours; for shared
+ * legacy INTx we ask the adapter whether this device actually raised
+ * the interrupt and report IRQ_NONE otherwise.
+ */
+irqreturn_t
+mic_irq_isr(int irq, void *data)
+{
+	mic_ctx_t *mic_ctx = (mic_ctx_t *)data;
+
+	if (mic_ctx->msie) {
+		adapter_imsr(mic_ctx);
+		return IRQ_HANDLED;
+	}
+
+	if (adapter_isr(mic_ctx) < 0)
+		return IRQ_NONE;
+
+	return IRQ_HANDLED;
+}
+
+extern struct attribute_group bd_attr_group;
+extern struct attribute_group host_attr_group;
+extern struct attribute_group scif_attr_group;
+extern struct attribute_group psmi_attr_group;
+extern struct bin_attribute mic_psmi_ptes_attr;
+
+/*
+ * PCI probe: allocate per-board state, enable the device, reserve the
+ * MMIO and aperture BARs, set up interrupts (MSI-X if possible, else
+ * shared INTx), initialize the adapter and publish sysfs entries.
+ * Returns 0 on success or a negative errno.
+ */
+static int
+mic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int brdnum = mic_data.dd_numdevs;
+	int err = 0;
+	bd_info_t *bd_info;
+	mic_ctx_t *mic_ctx;
+#ifdef CONFIG_PCI_MSI
+	int i=0;
+#endif
+	if ((bd_info = (bd_info_t *)kzalloc(sizeof(bd_info_t), GFP_KERNEL)) == NULL) {
+		printk("MIC: probe failed allocating memory for bd_info\n");
+		/* Fix: allocation failure is -ENOMEM, not -ENOSPC. */
+		return -ENOMEM;
+	}
+
+	mic_ctx = &bd_info->bi_ctx;
+	mic_ctx->bd_info = bd_info;
+	mic_ctx->bi_id = brdnum;
+	mic_ctx->bi_pdev = pdev;
+	mic_ctx->msie = 0;
+	mic_data.dd_bi[brdnum] = bd_info;
+
+	if ((err = pci_enable_device(pdev))) {
+		printk("pci_enable failed board #%d\n", brdnum);
+		goto probe_freebd;
+	}
+
+	pci_set_master(pdev);
+	err = pci_reenable_device(pdev);
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		printk("mic %d: ERROR DMA not available\n", brdnum);
+		goto probe_freebd;
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		printk("mic %d: ERROR pci_set_consistent_dma_mask(64) %d\n", brdnum, err);
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			printk("mic %d: ERROR pci_set_consistent_dma_mask(32) %d\n", brdnum, err);
+			goto probe_freebd;
+		}
+	}
+
+	// Allocate bar 4 for MMIO and GTT
+	bd_info->bi_ctx.mmio.pa = pci_resource_start(pdev, DLDR_MMIO_BAR);
+	bd_info->bi_ctx.mmio.len = pci_resource_len(pdev, DLDR_MMIO_BAR);
+	if (request_mem_region(bd_info->bi_ctx.mmio.pa,
+	    bd_info->bi_ctx.mmio.len, "mic") == NULL) {
+		printk("mic %d: failed to reserve mmio space\n", brdnum);
+		/* Fix: err was left at 0 here, making probe report success. */
+		err = -EBUSY;
+		goto probe_freebd;
+	}
+
+	// Allocate bar 0 for access Aperture
+	bd_info->bi_ctx.aper.pa = pci_resource_start(pdev, DLDR_APT_BAR);
+	bd_info->bi_ctx.aper.len = pci_resource_len(pdev, DLDR_APT_BAR);
+	if (request_mem_region(bd_info->bi_ctx.aper.pa,
+	    bd_info->bi_ctx.aper.len, "mic") == NULL) {
+		printk("mic %d: failed to reserve aperture space\n", brdnum);
+		/* Fix: err was left at 0 here, making probe report success. */
+		err = -EBUSY;
+		goto probe_relmmio;
+	}
+
+#ifdef CONFIG_PCI_MSI
+	if (mic_msi_enable){
+		for (i = 0; i < MIC_NUM_MSIX_ENTRIES; i ++)
+			bd_info->bi_msix_entries[i].entry = i;
+		err = pci_enable_msix(mic_ctx->bi_pdev, bd_info->bi_msix_entries,
+				      MIC_NUM_MSIX_ENTRIES);
+		if (err == 0 ) {
+			// Only support 1 MSIx for now
+			err = request_irq(bd_info->bi_msix_entries[0].vector,
+					  mic_irq_isr, 0, "mic", mic_ctx);
+			if (err != 0) {
+				printk("MIC: Error in request_irq %d\n", err);
+				goto probe_relaper;
+			}
+			mic_ctx->msie = 1;
+		}
+	}
+#endif
+
+	// TODO: this needs to be hardened and actually return errors
+	if ((err = adapter_init_device(mic_ctx)) != 0) {
+		printk("MIC: Adapter init device failed %d\n", err);
+		goto probe_relaper;
+	}
+
+	// Adding sysfs entries
+	set_sysfs_entries(mic_ctx);
+
+	/* Boards appear at minor 2 + board id ("ctrl" is 0, "scif" is 1). */
+	bd_info->bi_sysfsdev = device_create(mic_lindata.dd_class, &pdev->dev,
+		mic_lindata.dd_dev + 2 + mic_ctx->bd_info->bi_ctx.bi_id,
+		NULL, "mic%d", mic_ctx->bd_info->bi_ctx.bi_id);
+	err = sysfs_create_group(&mic_ctx->bd_info->bi_sysfsdev->kobj, &bd_attr_group);
+	mic_ctx->sysfs_state = sysfs_get_dirent(mic_ctx->bd_info->bi_sysfsdev->kobj.sd,
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,35) && LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0))
+		NULL,
+#endif
+		"state");
+
+	dev_set_drvdata(mic_ctx->bd_info->bi_sysfsdev, mic_ctx);
+
+	/* Fall back to shared legacy INTx when MSI-X was not enabled. */
+	if (!mic_ctx->msie)
+		if ((err = request_irq(mic_ctx->bi_pdev->irq, mic_irq_isr,
+				       IRQF_SHARED, "mic", mic_ctx)) != 0) {
+			printk("MIC: Error in request_irq %d\n", err);
+			goto probe_unmapaper;
+		}
+
+	adapter_probe(&bd_info->bi_ctx);
+
+	if (mic_ctx->bi_psmi.enabled) {
+		err = sysfs_create_group(&mic_ctx->bd_info->bi_sysfsdev->kobj,
+					 &psmi_attr_group);
+		err = device_create_bin_file(mic_ctx->bd_info->bi_sysfsdev,
+					     &mic_psmi_ptes_attr);
+	}
+
+	adapter_wait_reset(mic_ctx);
+
+	// Adding a board instance so increment the total number of MICs in the system.
+	list_add_tail(&bd_info->bi_list, &mic_data.dd_bdlist);
+	mic_data.dd_numdevs++;
+	printk("mic_probe %d:%d:%d as board #%d\n", pdev->bus->number,
+	       PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), brdnum);
+	return 0;
+
+probe_unmapaper:
+	/* Wait for the deferred ioremap to finish (or fail) before unmapping. */
+	wait_event(mic_ctx->ioremapwq, mic_ctx->aper.va || mic_ctx->state == MIC_RESETFAIL);
+	if (mic_ctx->aper.va)
+		iounmap((void *)bd_info->bi_ctx.aper.va);
+	iounmap((void *)bd_info->bi_ctx.mmio.va);
+
+probe_relaper:
+	release_mem_region(bd_info->bi_ctx.aper.pa, bd_info->bi_ctx.aper.len);
+
+probe_relmmio:
+	release_mem_region(bd_info->bi_ctx.mmio.pa, bd_info->bi_ctx.mmio.len);
+
+probe_freebd:
+	kfree(bd_info);
+	return err;
+}
+
+/*
+ * PCI remove callback.  NOTE(review): this tears down the highest-
+ * numbered board (dd_numdevs - 1) rather than looking up the board
+ * belonging to @pdev, so it is only correct if devices are removed in
+ * LIFO order -- confirm.
+ */
+static void
+mic_remove(struct pci_dev *pdev)
+{
+ int32_t brdnum;
+ bd_info_t *bd_info;
+
+ if (mic_data.dd_numdevs - 1 < 0)
+ return;
+ mic_data.dd_numdevs--;
+ brdnum = mic_data.dd_numdevs;
+
+ /* Make sure boards are shutdown and not available. */
+ bd_info = mic_data.dd_bi[brdnum];
+
+ /* Drop the "state" sysfs dirent under the lock so readers see NULL. */
+ spin_lock_bh(&bd_info->bi_ctx.sysfs_lock);
+ sysfs_put(bd_info->bi_ctx.sysfs_state);
+ bd_info->bi_ctx.sysfs_state = NULL;
+ spin_unlock_bh(&bd_info->bi_ctx.sysfs_lock);
+
+ if (bd_info->bi_ctx.bi_psmi.enabled) {
+ device_remove_bin_file(bd_info->bi_sysfsdev, &mic_psmi_ptes_attr);
+ sysfs_remove_group(&bd_info->bi_sysfsdev->kobj, &psmi_attr_group);
+ }
+ sysfs_remove_group(&bd_info->bi_sysfsdev->kobj, &bd_attr_group);
+
+ free_sysfs_entries(&bd_info->bi_ctx);
+ device_destroy(mic_lindata.dd_class,
+ mic_lindata.dd_dev + 2 + bd_info->bi_ctx.bi_id);
+
+ adapter_stop_device(&bd_info->bi_ctx, 1, 0);
+ /*
+ * Need to wait for reset since accessing the card while GDDR training
+ * is ongoing by adapter_remove(..) below for example can be fatal.
+ */
+ wait_for_reset(&bd_info->bi_ctx);
+
+ mic_disable_interrupts(&bd_info->bi_ctx);
+
+ /* Release whichever interrupt flavor mic_probe() requested. */
+ if (!bd_info->bi_ctx.msie) {
+ free_irq(bd_info->bi_ctx.bi_pdev->irq, &bd_info->bi_ctx);
+#ifdef CONFIG_PCI_MSI
+ } else {
+ free_irq(bd_info->bi_msix_entries[0].vector, &bd_info->bi_ctx);
+ pci_disable_msix(bd_info->bi_ctx.bi_pdev);
+#endif
+ }
+ adapter_remove(&bd_info->bi_ctx);
+ release_mem_region(bd_info->bi_ctx.aper.pa, bd_info->bi_ctx.aper.len);
+ release_mem_region(bd_info->bi_ctx.mmio.pa, bd_info->bi_ctx.mmio.len);
+ pci_disable_device(bd_info->bi_ctx.bi_pdev);
+ kfree(bd_info);
+}
+
+/* System shutdown hook: stop the card without waiting for reset and
+ * without a reattempt -- the machine is going down anyway. */
+static void
+mic_shutdown(struct pci_dev *pdev)
+{
+	mic_ctx_t *mic_ctx = get_device_context(pdev);
+
+	if (mic_ctx)
+		adapter_stop_device(mic_ctx, !RESET_WAIT , !RESET_REATTEMPT);
+}
+/* File operations shared by all minors; each handler dispatches on the
+ * minor number (0 = host control "ctrl", 1 = SCIF "scif", 2 = PSMI). */
+static const struct file_operations mic_fops = {
+ .open = mic_open,
+ .release = mic_release,
+ .read = mic_read,
+ .unlocked_ioctl = mic_ioctl,
+ .fasync = mic_fasync,
+ .mmap = mic_mmap,
+ .poll = mic_poll,
+ .flush = mic_flush,
+ .owner = THIS_MODULE,
+};
+
+/* PM callbacks: suspend/resume double as hibernation freeze/restore,
+ * with matching _noirq variants. */
+static const struct dev_pm_ops pci_dev_pm_ops = {
+ .suspend = micpm_suspend,
+ .resume = micpm_resume,
+ .freeze = micpm_suspend,
+ .restore = micpm_resume,
+ .suspend_noirq = micpm_suspend_noirq,
+ .resume_noirq = micpm_resume_noirq,
+ .freeze_noirq = micpm_suspend_noirq,
+ .restore_noirq = micpm_resume_noirq,
+};
+
+/* System-wide PM notifier (identifier keeps the original "notifer"
+ * spelling because it is referenced elsewhere in this file). */
+static struct notifier_block mic_pm_notifer = {
+ .notifier_call = micpm_notifier_block,
+};
+
+/* Supported PCI IDs: KNF/ABR parts and KNC parts.  NOTE(review): there
+ * is no MODULE_DEVICE_TABLE(pci, mic_pci_tbl) here, so udev will not
+ * auto-load the module on device discovery -- confirm this is intended. */
+static struct pci_device_id mic_pci_tbl[] = {
+#ifdef CONFIG_ML1OM
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ABR_2249, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ABR_224a, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+#endif
+#ifdef CONFIG_MK1OM
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2250, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2251, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2252, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2253, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2254, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2255, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2256, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2257, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2258, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_2259, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225a, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225b, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225c, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225d, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_KNC_225e, PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0, 0 },
+
+#endif
+ { 0, }
+};
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,31)
+/* The devnode callback's mode argument changed from mode_t to umode_t
+ * in kernel 3.3. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+#define MODE_T umode_t
+#else
+#define MODE_T mode_t
+#endif
+/* Place this driver's device nodes under /dev/mic/<name>. */
+static char *
+mic_devnode(struct device *dev, MODE_T *mode)
+{
+ return kasprintf(GFP_KERNEL, "mic/%s", dev_name(dev));
+}
+#undef MODE_T
+#endif
+
+/*
+ * Module init: set up the SCIF cache, char-dev region, cdev, device
+ * class, control/SCIF devices, virtual ethernet, the PCI driver (which
+ * probes all boards), the virtual console and PM plumbing.
+ * Returns 0 on success or a negative errno.
+ */
+static int __init
+mic_init(void)
+{
+	int ret, i;
+
+	adapter_init();
+
+	unaligned_cache = micscif_kmem_cache_create();
+	if (!unaligned_cache) {
+		ret = -ENOMEM;
+		goto init_free_ports;
+	}
+
+	mic_lindata.dd_pcidriver.name = "mic";
+	mic_lindata.dd_pcidriver.id_table = mic_pci_tbl;
+	mic_lindata.dd_pcidriver.probe = mic_probe;
+	mic_lindata.dd_pcidriver.remove = mic_remove;
+	mic_lindata.dd_pcidriver.driver.pm = &pci_dev_pm_ops;
+	mic_lindata.dd_pcidriver.shutdown = mic_shutdown;
+
+	/*
+	 * Fix: the "!= 0" used to sit inside the assignment parentheses
+	 * ("ret = f(...) != 0"), so ret was set to 0/1 instead of the
+	 * errno returned by alloc_chrdev_region()/cdev_add().
+	 */
+	if ((ret = alloc_chrdev_region(&mic_lindata.dd_dev,
+				       0, MAX_DLDR_MINORS, "mic")) != 0) {
+		printk("Error allocating device nodes: %d\n", ret);
+		goto init_free_ports;
+	}
+
+	cdev_init(&mic_lindata.dd_cdev, &mic_fops);
+	mic_lindata.dd_cdev.owner = THIS_MODULE;
+	mic_lindata.dd_cdev.ops = &mic_fops;
+
+	if ((ret = cdev_add(&mic_lindata.dd_cdev,
+			    mic_lindata.dd_dev, MAX_DLDR_MINORS)) != 0) {
+		kobject_put(&mic_lindata.dd_cdev.kobj);
+		goto init_free_region;
+	}
+
+	mic_lindata.dd_class = class_create(THIS_MODULE, "mic");
+	if (IS_ERR(mic_lindata.dd_class)) {
+		/* Fix: "createing" typo in the error message. */
+		printk("MICDLDR: Error creating mic class\n");
+		cdev_del(&mic_lindata.dd_cdev);
+		ret = PTR_ERR(mic_lindata.dd_class);
+		goto init_free_region;
+	}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,31)
+	/* Only the ctrl/scif nodes go under /dev/mic/; cleared below. */
+	mic_lindata.dd_class->devnode = mic_devnode;
+#endif
+
+	mic_lindata.dd_hostdev = device_create(mic_lindata.dd_class, NULL,
+					       mic_lindata.dd_dev, NULL, "ctrl");
+	mic_lindata.dd_scifdev = device_create(mic_lindata.dd_class, NULL,
+					       mic_lindata.dd_dev + 1, NULL, "scif");
+	ret = sysfs_create_group(&mic_lindata.dd_hostdev->kobj, &host_attr_group);
+	ret = sysfs_create_group(&mic_lindata.dd_scifdev->kobj, &scif_attr_group);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,31)
+	mic_lindata.dd_class->devnode = NULL;
+#endif
+
+	if (micveth_init(mic_lindata.dd_hostdev))
+		printk(KERN_ERR "%s: micveth_init failed\n", __func__);
+
+	ret = pci_register_driver(&mic_lindata.dd_pcidriver);
+	if (ret) {
+		micscif_destroy();
+		printk("mic: failed to register pci driver %d\n", ret);
+		goto clean_unregister;
+	}
+
+	if (!mic_data.dd_numdevs) {
+		printk("mic: No MIC boards present. SCIF available in loopback mode\n");
+	} else {
+		printk("mic: number of devices detected %d \n", mic_data.dd_numdevs);
+	}
+
+	for (i = 0; i < mic_data.dd_numdevs; i++) {
+		mic_ctx_t *mic_ctx = get_per_dev_ctx(i);
+		wait_event(mic_ctx->ioremapwq,
+			   mic_ctx->aper.va || mic_ctx->state == MIC_RESETFAIL);
+		destroy_workqueue(mic_ctx->ioremapworkq);
+	}
+
+	micveth_init_legacy(mic_data.dd_numdevs, mic_lindata.dd_hostdev);
+
+	ret = acptboot_init();
+
+#ifdef USE_VCONSOLE
+	micvcons_create(mic_data.dd_numdevs);
+#endif
+
+	/* Initialize Data structures for PM Disconnect */
+	ret = micpm_disconn_init(mic_data.dd_numdevs + 1);
+	if (ret)
+		printk(KERN_ERR "%s: Failed to initialize PM disconnect"
+		       " data structures. PM may not work as expected."
+		       " ret = %d\n", __func__, ret);
+	register_pm_notifier(&mic_pm_notifer);
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34))
+	ret = pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "mic", mic_pm_qos_cpu_dma_lat);
+	if (ret) {
+		printk(KERN_ERR "%s %d mic_pm_qos_cpu_dma_lat %d ret %d\n",
+		       __func__, __LINE__, mic_pm_qos_cpu_dma_lat, ret);
+		ret = 0;
+		/* Dont fail driver load due to PM QoS API. Fall through */
+	}
+#endif
+	return 0;
+
+clean_unregister:
+	device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev + 1);
+	device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev);
+	class_destroy(mic_lindata.dd_class);
+	cdev_del(&mic_lindata.dd_cdev);
+	unregister_pm_notifier(&mic_pm_notifer);
+init_free_region:
+	unregister_chrdev_region(mic_lindata.dd_dev, MAX_DLDR_MINORS);
+init_free_ports:
+	micpm_uninit();
+	return ret;
+}
+
+/* Module unload: tear down subsystems roughly in reverse of mic_init(). */
+static void __exit
+mic_exit(void)
+{
+ /* Close endpoints related to reverse registration */
+ acptboot_exit();
+
+#ifdef USE_VCONSOLE
+ micvcons_destroy(mic_data.dd_numdevs);
+#endif
+
+ /* Unregistering the PCI driver invokes mic_remove() for each board. */
+ pci_unregister_driver(&mic_lindata.dd_pcidriver);
+ micpm_uninit();
+
+ /* Uninit data structures for PM disconnect */
+ micpm_disconn_uninit(mic_data.dd_numdevs + 1);
+
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34))
+ pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "mic");
+#endif
+ micscif_kmem_cache_destroy();
+ vmcore_exit();
+ micveth_exit();
+ micscif_destroy();
+ ramoops_exit();
+
+ device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev + 1);
+ device_destroy(mic_lindata.dd_class, mic_lindata.dd_dev);
+ class_destroy(mic_lindata.dd_class);
+ cdev_del(&mic_lindata.dd_cdev);
+ unregister_chrdev_region(mic_lindata.dd_dev, MAX_DLDR_MINORS);
+ unregister_pm_notifier(&mic_pm_notifer);
+ return;
+}
+
+/* Reset the per-board sysfs bookkeeping to a clean (all-zero) state. */
+void
+set_sysfs_entries(mic_ctx_t *mic_ctx)
+{
+ memset(&mic_ctx->sysfs_info, 0, sizeof(mic_ctx->sysfs_info));
+}
+
+/* Free the board's image buffer and cached command-line strings. */
+void
+free_sysfs_entries(mic_ctx_t *mic_ctx)
+{
+	/* kfree(NULL) is a no-op, so the NULL guards were redundant. */
+	kfree(mic_ctx->image); /* mic_ctx->initramfs points into this buffer */
+	kfree(mic_ctx->sysfs_info.cmdline);
+	kfree(mic_ctx->sysfs_info.kernel_cmdline);
+}
+
+/* Return the per-board context for board @node.  No bounds checking --
+ * see the TODO below. */
+mic_ctx_t *
+get_per_dev_ctx(uint16_t node)
+{
+ /* TODO: Its important to check the upper bound of the dd_bi array as well.
+ * Cannot be done currently since not all calling functions to get_per_dev_ctx
+ * has the dd_numdevs set correctly. (See mic_ctx_map_single call in adapter_init_device
+ * thats callled even before dd_numdevs is incremented. */
+ return &mic_data.dd_bi[node]->bi_ctx;
+}
+
+/*
+ * Copy the current board count to the user-supplied pointer.
+ * @mic_ctx is unused but kept for interface compatibility.
+ * Returns 0, -EINVAL on a NULL pointer, or -EFAULT on copy failure.
+ */
+int
+get_num_devs(mic_ctx_t *mic_ctx, uint32_t *num_devs)
+{
+	if (!num_devs)
+		return -EINVAL;
+	return copy_to_user(num_devs, &mic_data.dd_numdevs,
+			    sizeof(uint32_t)) ? -EFAULT : 0;
+}
+
+/*
+ * Look up the size of file @fn and store it in *file_len.
+ * Returns 0 on success or positive EINVAL on failure (note: positive,
+ * not -EINVAL; callers appear to test for non-zero).
+ */
+int
+mic_get_file_size(const char* fn, uint32_t* file_len)
+{
+ struct file *filp;
+ loff_t filp_size;
+ uint32_t status = 0;
+ mm_segment_t fs = get_fs();
+
+ /* Legacy set_fs() pattern: allow vfs calls on kernel pointers. */
+ set_fs(get_ds());
+
+ if (!fn || IS_ERR(filp = filp_open(fn, 0, 0))) {
+ status = EINVAL;
+ goto cleanup_fs;
+ }
+
+ filp_size = GET_FILE_SIZE_FROM_INODE(filp);
+ if (filp_size <= 0) {
+ status = EINVAL;
+ goto cleanup_filp;
+ }
+
+ /* NOTE(review): truncates loff_t to uint32_t for files >= 4GB. */
+ *file_len = filp_size;
+cleanup_filp:
+ filp_close(filp, current->files);
+cleanup_fs:
+ set_fs(fs);
+ return status;
+}
+
+/*
+ * Load the file @fn into @buffer (at most @max_size bytes).
+ * Returns 0 on success, positive EINVAL on open/size errors (matching
+ * mic_get_file_size()'s convention), or -1 on a short read.
+ * NOTE(review): an empty file still returns 0 without touching the
+ * buffer -- confirm callers expect that.
+ */
+// loads file from hdd into pci physical memory
+int
+mic_load_file(const char* fn, uint8_t* buffer, uint32_t max_size)
+{
+	long c;
+	int status = 0;
+	struct file *filp;
+	loff_t filp_size, pos = 0;
+
+	mm_segment_t fs = get_fs();
+	set_fs(get_ds());
+
+	if (!fn || IS_ERR(filp = filp_open(fn, 0, 0))) {
+		status = EINVAL;
+		goto cleanup_fs;
+	}
+
+	filp_size = GET_FILE_SIZE_FROM_INODE(filp);
+	if (filp_size <= 0) {
+		goto cleanup_filp;
+	}
+
+	/*
+	 * Fix: max_size was accepted but never checked, so a file larger
+	 * than the destination buffer overflowed it.
+	 */
+	if (filp_size > max_size) {
+		status = EINVAL;
+		goto cleanup_filp;
+	}
+
+	c = vfs_read(filp, buffer, filp_size, &pos);
+	if(c != (long)filp_size) {
+		status = -1; //FIXME
+		goto cleanup_filp;
+	}
+
+cleanup_filp:
+	filp_close(filp, current->files);
+cleanup_fs:
+	set_fs(fs);
+
+	return status;
+}
+
+module_init(mic_init);
+module_exit(mic_exit);
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "micint.h"
+
+/* TODO: Improve debug messages */
+
+static int micvcons_open(struct tty_struct * tty, struct file * filp);
+static void micvcons_close(struct tty_struct * tty, struct file * filp);
+static int micvcons_write(struct tty_struct * tty, const unsigned char *buf,
+ int count);
+static int micvcons_write_room(struct tty_struct *tty);
+static void micvcons_set_termios(struct tty_struct *tty, struct ktermios * old);
+static void micvcons_timeout(unsigned long);
+static void micvcons_throttle(struct tty_struct *tty);
+static void micvcons_unthrottle(struct tty_struct *tty);
+static void micvcons_wakeup_readbuf(struct work_struct *work);
+static int micvcons_resume(struct _mic_ctx_t *mic_ctx);
+
+/* tty callbacks for the MIC virtual console driver. */
+static struct tty_operations micvcons_tty_ops = {
+ .open = micvcons_open,
+ .close = micvcons_close,
+ .write = micvcons_write,
+ .write_room = micvcons_write_room,
+ .set_termios = micvcons_set_termios,
+ .throttle = micvcons_throttle,
+ .unthrottle = micvcons_unthrottle,
+};
+
+static struct tty_driver *micvcons_tty = NULL;
+static u16 extra_timeout = 0;
+static u8 restart_timer_flag = MICVCONS_TIMER_RESTART;
+/* Single poll timer shared by all open ports (list below). */
+static struct timer_list vcons_timer;
+static struct list_head timer_list_head;
+/*
+ * Fix: this spinlock was never spin_lock_init()ed anywhere in the
+ * driver; initialize it statically so it is valid from its first use
+ * in micvcons_open() and lock debugging does not complain.
+ */
+static DEFINE_SPINLOCK(timer_list_lock);
+
+/*
+ * Allocate and register the virtual console tty driver plus one tty
+ * device and work queue per board.  Idempotent: returns early if the
+ * driver is already registered.  Returns 0 or a negative errno.
+ */
+int
+micvcons_create(int num_bds)
+{
+ micvcons_port_t *port;
+ bd_info_t *bd_info;
+ int bd, ret = 0;
+ char wq_name[14];
+ struct device *dev;
+
+ INIT_LIST_HEAD(&timer_list_head);
+
+ if (micvcons_tty)
+ goto exit;
+
+ micvcons_tty = alloc_tty_driver(num_bds);
+ if (!micvcons_tty) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+ micvcons_tty->owner = THIS_MODULE;
+ micvcons_tty->driver_name = MICVCONS_DEVICE_NAME;
+ micvcons_tty->name = MICVCONS_DEVICE_NAME;
+ /* major 0: let the tty core assign a dynamic major number. */
+ micvcons_tty->major = 0;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0))
+ micvcons_tty->minor_num = num_bds;
+#endif
+ micvcons_tty->minor_start = 0;
+ micvcons_tty->type = TTY_DRIVER_TYPE_SERIAL;
+ micvcons_tty->subtype = SERIAL_TYPE_NORMAL;
+ micvcons_tty->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
+ micvcons_tty->init_termios = tty_std_termios;
+ micvcons_tty->init_termios.c_iflag = IGNCR;
+ micvcons_tty->init_termios.c_oflag = 0;
+ micvcons_tty->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+ micvcons_tty->init_termios.c_lflag = 0;
+
+ tty_set_operations(micvcons_tty, &micvcons_tty_ops);
+
+ if ((ret = tty_register_driver(micvcons_tty)) != 0) {
+ printk("Failed to register vcons tty driver\n");
+ put_tty_driver(micvcons_tty);
+ micvcons_tty = NULL;
+ goto exit;
+ }
+
+ for (bd = 0; bd < num_bds; bd++) {
+ port = &mic_data.dd_ports[bd];
+ port->dp_bdinfo = mic_data.dd_bi[bd];
+
+ spin_lock_init(&port->dp_lock);
+ mutex_init (&port->dp_mutex);
+
+ bd_info = (bd_info_t *)port->dp_bdinfo;
+ bd_info->bi_port = port;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+ tty_port_init(&port->port);
+ dev = tty_port_register_device(&port->port, micvcons_tty, bd, NULL);
+#else
+ dev = tty_register_device(micvcons_tty, bd, NULL);
+ if (IS_ERR(dev)) {
+ printk("Failed to register vcons tty device\n");
+ micvcons_destroy(bd);
+ ret = PTR_ERR(dev);
+ goto exit;
+ }
+#endif
+ /* NOTE(review): wq_name[14] only fits board numbers up to 999. */
+ snprintf(wq_name, sizeof(wq_name), "VCONS MIC %d", bd);
+ port->dp_wq = __mic_create_singlethread_workqueue(wq_name);
+ if (!port->dp_wq) {
+ printk(KERN_ERR "%s: create_singlethread_workqueue\n",
+ __func__);
+ tty_unregister_device(micvcons_tty, bd);
+ micvcons_destroy(bd);
+ ret = -ENOMEM;
+ goto exit;
+ }
+ INIT_WORK(&port->dp_wakeup_read_buf, micvcons_wakeup_readbuf);
+ }
+ /* Shared poll timer; armed on first open (micvcons_open). */
+ vcons_timer.function = micvcons_timeout;
+ vcons_timer.data = (unsigned long)(&timer_list_head);
+ init_timer(&vcons_timer);
+exit:
+ return ret;
+}
+
+/*
+ * Unregister the per-board tty devices/work queues and the vcons tty
+ * driver itself.  @num_bds: number of boards whose ports were created.
+ */
+void micvcons_destroy(int num_bds)
+{
+ int bd, ret;
+ micvcons_port_t *port;
+
+ if (!micvcons_tty)
+ return;
+ for (bd = 0; bd < num_bds; bd++) {
+ port = &mic_data.dd_ports[bd];
+ destroy_workqueue(port->dp_wq);
+ tty_unregister_device(micvcons_tty, bd);
+ }
+ ret = tty_unregister_driver(micvcons_tty);
+ put_tty_driver(micvcons_tty);
+ micvcons_tty = NULL;
+
+ if (ret)
+ printk(KERN_ERR "tty unregister_driver failed with code %d\n", ret);
+}
+
+/*
+ * tty open: at most one reading file and one writing file per port
+ * (additional openers get -EBUSY).  The first open of a port starts
+ * the card console and adds the port to the global poll-timer list.
+ */
+static int
+micvcons_open(struct tty_struct * tty, struct file * filp)
+{
+ micvcons_port_t *port = &mic_data.dd_ports[tty->index];
+ int ret = 0;
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(tty->index);
+
+ tty->driver_data = port;
+
+ mutex_lock(&port->dp_mutex);
+ spin_lock_bh(&port->dp_lock);
+
+ /* Opened for writing (write-only or read-write). */
+ if ((filp->f_flags & O_ACCMODE) != O_RDONLY) {
+ if (port->dp_writer) {
+ ret = -EBUSY;
+ goto exit_locked;
+ }
+ port->dp_writer = filp;
+ port->dp_bytes = 0;
+ }
+
+ /* Opened for reading (read-only or read-write). */
+ if ((filp->f_flags & O_ACCMODE) != O_WRONLY) {
+ if (port->dp_reader) {
+ ret = -EBUSY;
+ goto exit_locked;
+ }
+ port->dp_reader = filp;
+ port->dp_canread = 1;
+ }
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0))
+ tty->low_latency = 0;
+#endif
+
+ if (!port->dp_tty)
+ port->dp_tty = tty;
+ if (!port->dp_vcons)
+ port->dp_vcons = &mic_ctx->bi_vcons;
+ if (tty->count == 1) {
+ ret = micvcons_start(mic_ctx);
+ if (ret != 0)
+ goto exit_locked;
+ spin_lock(&timer_list_lock);
+ list_add_tail_rcu(&port->list_member, &timer_list_head);
+ /* First port on the list: kick off the shared poll timer. */
+ if (list_is_singular(&timer_list_head)) {
+ restart_timer_flag = MICVCONS_TIMER_RESTART;
+ mod_timer(&vcons_timer, jiffies +
+ msecs_to_jiffies(MICVCONS_SHORT_TIMEOUT));
+ }
+ spin_unlock(&timer_list_lock);
+ }
+
+exit_locked:
+ spin_unlock_bh(&port->dp_lock);
+ mutex_unlock(&port->dp_mutex);
+ return ret;
+}
+
+/*
+ * Remove @port from the poll-timer list.  If the list becomes empty,
+ * flag the timer for shutdown and wait for a running handler with
+ * del_timer_sync() -- called outside the lock to avoid deadlocking
+ * against the timer callback.  synchronize_rcu() ensures any RCU list
+ * walker has finished with the entry before the caller reuses it.
+ */
+static inline void
+micvcons_del_timer_entry(micvcons_port_t *port)
+{
+ spin_lock(&timer_list_lock);
+ list_del_rcu(&port->list_member);
+ if (list_empty(&timer_list_head)) {
+ restart_timer_flag = MICVCONS_TIMER_SHUTDOWN;
+ spin_unlock(&timer_list_lock);
+ del_timer_sync(&vcons_timer);
+ } else {
+ spin_unlock(&timer_list_lock);
+ }
+ synchronize_rcu();
+}
+
+/*
+ * tty close: on the last reference, stop polling this port and wait
+ * for queued wake-up work; always drop this file's reader/writer role.
+ */
+static void
+micvcons_close(struct tty_struct * tty, struct file * filp)
+{
+	micvcons_port_t *port = (micvcons_port_t *)tty->driver_data;
+
+	mutex_lock(&port->dp_mutex);
+	if (tty->count == 1) {
+		micvcons_del_timer_entry(port);
+		flush_workqueue(port->dp_wq);
+	}
+	spin_lock_bh(&port->dp_lock);
+	/* Idiom fix: clear pointer members with NULL rather than 0. */
+	if (port->dp_reader == filp)
+		port->dp_reader = NULL;
+
+	if (port->dp_writer == filp)
+		port->dp_writer = NULL;
+
+	if (tty->count == 1)
+		port->dp_tty = NULL;
+	spin_unlock_bh(&port->dp_lock);
+	mutex_unlock(&port->dp_mutex);
+}
+
+/*
+ * tty write: wake the card if it is sleeping, then push @count bytes
+ * into the outbound ring buffer and interrupt the card.  Returns the
+ * number of bytes queued (0 if the card is not ready or cannot wake).
+ */
+static int
+micvcons_write(struct tty_struct * tty, const unsigned char *buf, int count)
+{
+ micvcons_port_t *port = (micvcons_port_t *)tty->driver_data;
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(tty->index);
+ int bytes=0, status;
+ struct vcons_buf *vcons_host_header;
+ u8 card_alive = 1;
+
+ spin_lock_bh(&port->dp_lock);
+ vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt;
+ if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) {
+ status = micvcons_resume(mic_ctx);
+ if (status != 0) {
+ /* If card can not wakeup, it is dead. */
+ card_alive = 0;
+ goto exit;
+ }
+ }
+ if (vcons_host_header->mic_magic != MIC_VCONS_READY)
+ goto exit;
+ bytes = micvcons_port_write(port, buf, count);
+ if (bytes) {
+ mic_send_hvc_intr(mic_ctx);
+ extra_timeout = 0;
+ }
+exit:
+ spin_unlock_bh(&port->dp_lock);
+ /* A dead card's port is pulled off the poll-timer list (outside
+ * dp_lock, since del_timer_sync can sleep-wait on the handler). */
+ if (!card_alive)
+ micvcons_del_timer_entry(port);
+ return bytes;
+}
+
+/* Report free space in the outbound ring buffer; zero until the ring
+ * has been set up by micvcons_initport(). */
+static int
+micvcons_write_room(struct tty_struct *tty)
+{
+	micvcons_port_t *port = (micvcons_port_t *)tty->driver_data;
+	int space = 0;
+
+	spin_lock_bh(&port->dp_lock);
+	if (port->dp_out)
+		space = micscif_rb_space(port->dp_out);
+	spin_unlock_bh(&port->dp_lock);
+
+	return space;
+}
+
+/* No termios handling needed: the virtual console has no real UART. */
+static void
+micvcons_set_termios(struct tty_struct *tty, struct ktermios * old)
+{
+}
+
+/*
+ * Drain the inbound (card-to-host) ring buffer into the tty flip
+ * buffer in 64-byte chunks.  Returns the total number of bytes handed
+ * to the tty layer.  Called with port->dp_lock held (see
+ * micvcons_readport()).
+ */
+static int
+micvcons_readchars(micvcons_port_t *port)
+{
+	int len, ret, get_count;
+	int bytes_total = 0;
+	int bytes_read = 0;
+	char buf[64];
+
+	for (;;) {
+		len = micscif_rb_count(port->dp_in, sizeof(buf));
+		if (!len)
+			break;
+		get_count = min(len, (int)sizeof(buf));
+		ret = micscif_rb_get_next(port->dp_in, buf, get_count);
+		micscif_rb_update_read_ptr(port->dp_in);
+		if (port->dp_reader && port->dp_canread) {
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+			if ((bytes_read = tty_insert_flip_string(
+					&port->port, buf, get_count)) != 0)
+				tty_flip_buffer_push(&port->port);
+#else
+			bytes_read = tty_insert_flip_string(port->dp_tty,
+							    buf, get_count);
+			tty_flip_buffer_push(port->dp_tty);
+#endif
+			bytes_total += bytes_read;
+			if (bytes_read != get_count) {
+				/*
+				 * Fix: the old backslash line continuation
+				 * embedded raw indentation whitespace inside
+				 * the log message; use adjacent string
+				 * literals instead.
+				 */
+				printk(KERN_WARNING "dropping characters: "
+				       "bytes_read %d, get_count %d\n",
+				       bytes_read, get_count);
+				break;
+			}
+		}
+	}
+	return bytes_total;
+}
+
+/*
+ * Lazily set up the host<->card ring buffers for a port.  Locates the
+ * card-side header/buffer through the aperture (via a PCI aperture
+ * window on FAMILY_ABR parts) and announces the host side is open.
+ * Returns 0, -EFAULT if the host-side areas are missing, or -ENOMEM.
+ * Called with port->dp_lock held (see micvcons_readport()).
+ */
+static int
+micvcons_initport(micvcons_port_t *port)
+{
+ struct vcons_buf *vcons_host_header;
+ struct vcons_mic_header *vcons_mic_header;
+ char *mic_hdr, *mic_buf, *host_buf;
+
+ vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt;
+ if (!vcons_host_header) {
+ printk(KERN_ERR "vcons_host_header NULL\n");
+ return -EFAULT;
+ }
+
+ host_buf = (char *)port->dp_vcons->dc_buf_virt;
+ if (!host_buf) {
+ printk(KERN_ERR "host_buf NULL\n");
+ return -EFAULT;
+ }
+
+ if (port->dp_bdinfo->bi_ctx.bi_family == FAMILY_ABR) {
+ /* KNF: window the last aperture page onto the card header. */
+ set_pci_aperture(&port->dp_bdinfo->bi_ctx,
+ (port->dp_bdinfo->bi_ctx.aper.len - PAGE_SIZE) >> PAGE_SHIFT,
+ vcons_host_header->i_hdr_addr & PAGE_MASK, PAGE_SIZE);
+ mic_hdr = port->dp_bdinfo->bi_ctx.aper.va +
+ port->dp_bdinfo->bi_ctx.aper.len - PAGE_SIZE;
+ mic_buf = mic_hdr + PAGE_SIZE/2;
+ } else {
+ mic_hdr = port->dp_bdinfo->bi_ctx.aper.va + vcons_host_header->i_hdr_addr;
+ mic_buf = port->dp_bdinfo->bi_ctx.aper.va + vcons_host_header->i_buf_addr;
+ }
+
+ /* GFP_ATOMIC: we are called under dp_lock (a BH spinlock). */
+ port->dp_in = kmalloc(sizeof(struct micscif_rb), GFP_ATOMIC);
+ if (port->dp_in)
+ port->dp_out = kmalloc(sizeof(struct micscif_rb), GFP_ATOMIC);
+ else
+ return -ENOMEM;
+
+ if (port->dp_out) {
+ vcons_mic_header = (struct vcons_mic_header *)mic_hdr;
+ micscif_rb_init(port->dp_in,
+ &vcons_mic_header->o_rd,
+ &vcons_host_header->o_wr,
+ host_buf,
+ vcons_host_header->o_size);
+ micscif_rb_init(port->dp_out, &vcons_host_header->i_rd,
+ &vcons_mic_header->i_wr,
+ mic_buf,
+ vcons_host_header->i_size);
+ /* Make ring setup visible before telling the card we are open. */
+ wmb();
+ writel(MIC_VCONS_HOST_OPEN, &vcons_mic_header->host_status);
+ } else {
+ kfree(port->dp_in);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
/*
 * micvcons_readport - drain console output from one port into its tty.
 *
 * Returns the number of characters read (0 if the port/tty is not set up),
 * or a negative errno if lazy initialization via micvcons_initport() fails.
 * Runs under port->dp_lock with bottom halves disabled.
 */
static int
micvcons_readport(micvcons_port_t *port)
{
	int num_chars_read = 0, status;
	/* Last observed magic; used to print the version-mismatch error once. */
	static uint32_t prev_mic_magic;
	struct vcons_buf *vcons_host_header;

	if (!port || !port->dp_vcons)
		return 0;

	spin_lock_bh(&port->dp_lock);
	if (!port->dp_tty) {
		spin_unlock_bh(&port->dp_lock);
		return 0;
	}

	vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt;
	/* Only READY and SLEEPING are serviceable states. */
	if ((vcons_host_header->mic_magic != MIC_VCONS_READY) &&
		(vcons_host_header->mic_magic != MIC_VCONS_SLEEPING)) {
		/* Report a ring-buffer version mismatch only on transition. */
		if ((vcons_host_header->mic_magic == MIC_VCONS_RB_VER_ERR)
			&& (vcons_host_header->mic_magic != prev_mic_magic)) {
			printk(KERN_ERR "Card and host ring buffer versions mismatch.");
			printk(KERN_ERR "Card version: %d, Host version: %d \n",
					vcons_host_header->mic_rb_ver,
					vcons_host_header->host_rb_ver);
		}
		goto exit;
	}
	/* Lazily initialize the ring buffers on first use. */
	if (!port->dp_in) {
		status = micvcons_initport(port);
		if (status != 0) {
			spin_unlock_bh(&port->dp_lock);
			return status;
		}
	}

	if (port->dp_in) {
		if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) {
			/*
			 * If the card is sleeping and there is data in the
			 * buffer, schedule work in a work queue to wake-up
			 * the card and read from the buffer.
			 */
			if (micscif_rb_count(port->dp_in, 1))
				queue_work(port->dp_wq,
					&port->dp_wakeup_read_buf);
		} else {
			num_chars_read = micvcons_readchars(port);
			tty_wakeup(port->dp_tty);
		}
	}
exit:
	prev_mic_magic = vcons_host_header->mic_magic;
	spin_unlock_bh(&port->dp_lock);
	return num_chars_read;
}
+
+static void
+micvcons_wakeup_readbuf(struct work_struct *work)
+{
+ u8 card_alive = 1;
+ int status;
+ micvcons_port_t *port;
+ struct vcons_buf *vcons_host_header;
+
+ port = container_of(work, micvcons_port_t, dp_wakeup_read_buf);
+
+ vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt;
+ spin_lock_bh(&port->dp_lock);
+ status = micvcons_resume(get_per_dev_ctx(port->dp_tty->index));
+ if (status == 0) {
+ micvcons_readchars(port);
+ tty_wakeup(port->dp_tty);
+ } else {
+ /* If card can not wakeup, it is dead. */
+ card_alive = 0;
+ }
+ spin_unlock_bh(&port->dp_lock);
+ if (!card_alive)
+ micvcons_del_timer_entry(port);
+}
+
/*
 * micvcons_timeout - periodic timer that polls every registered console
 * port for output.
 *
 * @data: pointer to the RCU-protected list of active ports.
 *
 * The poll interval backs off (up to MICVCONS_MAX_TIMEOUT) while no port
 * produces output and snaps back to the short interval as soon as any
 * console is active. The timer is only re-armed while restart_timer_flag
 * is MICVCONS_TIMER_RESTART, checked under timer_list_lock.
 */
static void
micvcons_timeout(unsigned long data)
{
	struct list_head *timer_list_ptr = (struct list_head *)data;
	micvcons_port_t *port;
	u8 console_active = 0;
	int num_chars_read = 0;

	/* Ports may be added/removed concurrently; walk under RCU. */
	rcu_read_lock();
	list_for_each_entry_rcu(port, timer_list_ptr, list_member) {
		num_chars_read = micvcons_readport(port);
		if (num_chars_read != 0)
			console_active = 1;
	}
	rcu_read_unlock();

	spin_lock(&timer_list_lock);
	if (restart_timer_flag == MICVCONS_TIMER_RESTART) {
		/* Idle consoles stretch the period; activity resets it. */
		extra_timeout = (console_active ? 0 :
				extra_timeout + MICVCONS_SHORT_TIMEOUT);
		extra_timeout = min(extra_timeout, (u16)MICVCONS_MAX_TIMEOUT);
		mod_timer(&vcons_timer, jiffies +
			msecs_to_jiffies(MICVCONS_SHORT_TIMEOUT+extra_timeout));
	}
	spin_unlock(&timer_list_lock);
}
+
+static void
+micvcons_throttle(struct tty_struct *tty)
+{
+ micvcons_port_t *port = (micvcons_port_t *)tty->driver_data;
+ port->dp_canread = 0;
+}
+
+static void
+micvcons_unthrottle(struct tty_struct *tty)
+{
+ micvcons_port_t *port = (micvcons_port_t *)tty->driver_data;
+ port->dp_canread = 1;
+}
+
+int micvcons_start(mic_ctx_t *mic_ctx)
+{
+ struct vcons_buf *vcons_host_header;
+ int status;
+ micvcons_port_t *port = mic_ctx->bd_info->bi_port;
+
+ vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt;
+ if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) {
+ status = micvcons_resume(mic_ctx);
+ if (status != 0)
+ return status;
+ }
+ if (vcons_host_header->mic_magic == MIC_VCONS_READY) {
+ if (!port->dp_in) {
+ status = micvcons_initport(port);
+ if (status != 0)
+ return status;
+ }
+ }
+ return 0;
+}
+
+int micvcons_port_write(struct micvcons_port *port, const unsigned char *buf,
+ int count)
+{
+ int ret;
+ uint32_t bytes = 0;
+
+ if (port->dp_out) {
+ bytes = min(count, micscif_rb_space(port->dp_out));
+ ret = micscif_rb_write(port->dp_out, (void *)buf, bytes);
+ BUG_ON(ret);
+ port->dp_bytes += bytes;
+ micscif_rb_commit(port->dp_out);
+ }
+ return bytes;
+}
+
+/**
+ * micvcons_stop - cleans up before a node is rebooted
+ * @ mic_ctx: node to clean up
+ *
+ * Called before rebooting a node, reads remaining characters
+ * from the node's vcons output buffer, resets the input/output
+ * ring buffers so that things work when the node comes up again
+ */
+void
+micvcons_stop(mic_ctx_t *mic_ctx)
+{
+ micvcons_port_t *port;
+ struct vcons_buf *vcons_host_header;
+
+ port = mic_ctx->bd_info->bi_port;
+ micvcons_readport(port);
+ spin_lock_bh(&port->dp_lock);
+ if (port->dp_in) {
+ vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt;
+ vcons_host_header->mic_magic = 0;
+ kfree(port->dp_in);
+ kfree(port->dp_out);
+ port->dp_in = NULL;
+ port->dp_out = NULL;
+ }
+ spin_unlock_bh(&port->dp_lock);
+}
+
+/**
+ * micvcons_resume - sets the state of a node's console to ready
+ * @ mic_ctx: node to clean up
+ *
+ * @ return: zero if successful.
+ * called before resuming a node from PC6. MUST acquire the spinlock
+ * port->dp_lock with bottom-halves disabled before calling this function.
+ */
+static int
+micvcons_resume(mic_ctx_t *mic_ctx)
+{
+ int status = 0;
+ micvcons_port_t *port;
+ struct vcons_buf *vcons_host_header;
+
+ port = mic_ctx->bd_info->bi_port;
+ vcons_host_header = mic_ctx->bi_vcons.dc_hdr_virt;
+ if (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING) {
+ do {
+ vcons_host_header->mic_magic = MIC_VCONS_WAKINGUP;
+ spin_unlock_bh(&port->dp_lock);
+ status = micscif_connect_node(mic_get_scifnode_id(mic_ctx), false);
+ spin_lock_bh(&port->dp_lock);
+ } while ((status == 0) &&
+ (vcons_host_header->mic_magic == MIC_VCONS_SLEEPING));
+ if (status == 0)
+ vcons_host_header->mic_magic = MIC_VCONS_READY;
+ }
+ return status;
+}
+
+/**
+ * micvcons_pm_disconnect_node - Check if a card can be put to sleep in case
+ * there is any activity on the virtual console. If yes, it also sets the
+ * internal state of a node's console to sleeping.
+ * @ node_bitmask: bits set indicate which cards to check.
+ * Bit-1 for the first, Bit-2 for the second,...
+ * Ignore Bit-0 which indicates host.
+ * @ return: bits set indicating which cards can sleep.
+ * This is called from PM to check if a card can be put to sleep (PC-6 state).
+ * This is called when the node is disconnected from the SCIF network
+ * before putting it into the PC6 state where it should no longer
+ * receive an PCIe transactions until woken up by the host driver.
+ */
+int
+micvcons_pm_disconnect_node(uint8_t *node_bitmask, enum disconn_type type)
+{
+ int err = 0;
+ if ((type == DISCONN_TYPE_POWER_MGMT) && (node_bitmask)) {
+ int i = 0;
+ mic_ctx_t *mic_ctx;
+ micvcons_port_t *port;
+ struct vcons_buf *vcons_host_header;
+
+ for (i = 0; i <= mic_data.dd_numdevs; i++) {
+ if (!get_nodemask_bit(node_bitmask, i))
+ continue;
+
+ if (!(mic_ctx = get_per_dev_ctx(i - 1)))
+ continue;
+
+ port = mic_ctx->bd_info->bi_port;
+ micvcons_readport(port);
+ /*
+ * If this function is called when virtual console is
+ * not active, port->dp_vcons needs to be initialized.
+ */
+ if (!port->dp_vcons)
+ port->dp_vcons = &mic_ctx->bi_vcons;
+
+ vcons_host_header = (struct vcons_buf *)port->dp_vcons->dc_hdr_virt;
+ spin_lock_bh(&port->dp_lock);
+ vcons_host_header->mic_magic = MIC_VCONS_SLEEPING;
+ spin_unlock_bh(&port->dp_lock);
+ }
+ }
+
+ return err;
+}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "micint.h"
+#include "mic_common.h"
+#include <mic/micsboxdefine.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+#include "mic/micveth.h"
+
+#define PWR_MGMT_NO_POLL_AFTER_LINKS_UP 1
+
+/*
+ In intr/poll modes, mic_smpt_uninit has already been called before
+ micveth_destroy is called during rmmod. This results in host driver crash. The
+ current workaround is, given the 'legacy' nature of VNET intr/poll modes, to
+ not call mic_ctx_unmap_single() at rmmod. This workaround will result in some
+ unmapped memory and a warn_on from micscif_smpt.c.
+ */
+#define WA_UNMAP_AT_RMMOD 0
+
+static void micveth_clientpoll(struct work_struct *work);
+static void micveth_poll(struct work_struct *work);
+static int micvnet_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell);
+static void micvnet_intr_bh_handler(struct work_struct *work);
+void micveth_send_intr(micveth_info_t *veth_info);
+
+micveth_t micveth;
+
+void dump_skb(struct sk_buff *skb, int xmit);
+
/* Return the board context (mic_ctx_t) that owns this vnet interface. */
static inline
mic_ctx_t *veth_to_ctx(micveth_info_t *veth_info)
{
	return veth_info->mic_ctx;
}
+
+static int
+micveth_set_address(struct net_device *dev, void *p)
+{
+ struct sockaddr *sa = p;
+
+ if (!is_valid_ether_addr(sa->sa_data))
+ return -EADDRNOTAVAIL;
+
+ memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
+ return 0;
+}
+
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0)
/* Deliberate no-op: multicast is not supported (IFF_MULTICAST is cleared
 * in micveth_setup), but older kernels require this callback to exist. */
static void
micveth_multicast_list(struct net_device *dev)
{
}
#endif
+
/*
 * micveth_deliver - place one outgoing skb on the board's tx ring.
 *
 * Maps the skb for DMA, fills the next tx descriptor, publishes it with a
 * write barrier before advancing the tail, and (in interrupt mode) rings
 * the card's doorbell. Returns 0 on success, 1 if the ring is full (the
 * caller drops the packet and the stack retries).
 */
static int
micveth_deliver(struct sk_buff *skb, struct net_device *dev, micveth_info_t *veth_info)
{
	veth_ring_t *ring;
	ring_queue_t *tx_queue;
	ring_desc_t *desc;
	ring_packet_t *packet;
	int next_tail;

	//dump_skb(skb, 1);

	spin_lock(&veth_info->vi_txlock);
	ring = &veth_info->vi_ring.ring;
	tx_queue = &ring->r_tx;

	/* One slot is kept empty to distinguish full from empty. */
	next_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length;
	if (next_tail == tx_queue->rq_head) {
		// queue_full situation - just drop the packet and let the stack retry
		spin_unlock(&veth_info->vi_txlock);
		return 1;
	}

	desc = &tx_queue->rq_descs[tx_queue->rq_tail];
	packet = &veth_info->vi_tx_desc[tx_queue->rq_tail];
	packet->pd_skb = skb;
	packet->pd_phys = mic_ctx_map_single(veth_to_ctx(veth_info),
					     skb->data, skb->len);
	packet->pd_length = skb->len;
	desc->rd_phys = packet->pd_phys;
	desc->rd_length = skb->len;
	desc->rd_valid = 1;

	/*
	 * Need a write memory barrier between copying the skb data to
	 * the buffer and updating the tail pointer. NOT an smp_wmb(),
	 * because this memory barrier needs to be done even if there is
	 * a single CPU in the system.
	 */
	wmb();
	tx_queue->rq_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length;
	spin_unlock(&veth_info->vi_txlock);

	/* Poll mode relies on the card's timer; intr mode needs a kick. */
	if (mic_vnet_mode == VNET_MODE_INTR) {
		micveth_send_intr(veth_info);
	}

	return 0;
}
+
+static int
+micveth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ micveth_info_t *veth_info;
+
+ if (be16_to_cpu(skb->protocol) == ETH_P_IPV6) {
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ veth_info = dev->ml_priv;
+
+ if (veth_info->vi_state != VETH_STATE_LINKUP) {
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ if (micveth_deliver(skb, dev, veth_info)) {
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ }
+
+ return NETDEV_TX_OK;
+}
+
/* ndo_change_mtu callback.
 * NOTE(review): accepts any value without range checking — presumably the
 * core's min/max MTU handling (or trusted callers) bounds it; verify. */
static int
micveth_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	return 0;
}
+
+/* Start callback */
+static int
+micveth_start_dev(struct net_device *dev)
+{
+ micveth_info_t *veth_info = dev->ml_priv;
+
+ micveth_start(veth_info->mic_ctx);
+ return 0;
+}
+
+/* Stop callback */
+static int
+micveth_stop_dev(struct net_device *dev)
+{
+ micveth_info_t *veth_info = dev->ml_priv;
+
+ micveth_stop(veth_info->mic_ctx);
+ return 0;
+}
+
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28)
/* net_device operations for kernels with the netdev_ops structure;
 * older kernels get the same callbacks wired up in micveth_setup(). */
static const struct net_device_ops veth_netdev_ops = {
	.ndo_open		= micveth_start_dev,
	.ndo_stop		= micveth_stop_dev,
	.ndo_start_xmit		= micveth_xmit,
	.ndo_validate_addr	= eth_validate_addr,
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0)
	.ndo_set_multicast_list = micveth_multicast_list,
#endif
	.ndo_set_mac_address	= micveth_set_address,
	.ndo_change_mtu		= micveth_change_mtu,
};
#endif
+
/*
 * micveth_setup - net_device setup callback (also used by rtnl_link_ops).
 *
 * Configures the device as an ethernet interface with a large MTU, no
 * multicast, no tx queue, and a random MAC address.
 */
static void
micveth_setup(struct net_device *dev)
{
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28)
	/* Pre-netdev_ops kernels: wire callbacks directly on the device. */
	dev->hard_start_xmit = micveth_xmit;
	dev->set_multicast_list = micveth_multicast_list;
	dev->set_mac_address = micveth_set_address;
#endif
	ether_setup(dev);

	/* Initialize the device structure. */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28)
	dev->netdev_ops = &veth_netdev_ops;
#endif
	dev->destructor = free_netdev;

	/* Fill in device structure with ethernet-generic values. */
	dev->mtu = (MICVETH_MAX_PACKET_SIZE);
	dev->tx_queue_len = 0;
	dev->flags &= ~IFF_MULTICAST;
	random_ether_addr(dev->dev_addr);
}
+
+static int
+micveth_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+}
+
/* rtnl link registration so "micveth" devices are netlink-manageable. */
static struct rtnl_link_ops micveth_link_ops __read_mostly = {
	.kind		= "micveth",
	.setup		= micveth_setup,
	.validate	= micveth_validate,
};
+
/*
 * micveth_probe_int - per-board setup for the legacy intr/poll vnet modes.
 *
 * Caches MMIO/scratch-register pointers, initializes the tx/rx rings,
 * preallocates and DMA-maps one skb per rx slot, then allocates and
 * registers the net_device.
 *
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): the -ENOMEM paths below return without unmapping
 * vi_ring.phys or freeing/unmapping already-allocated rx skbs — a leak on
 * partial failure. Driver load fails anyway in that case; confirm whether
 * teardown is handled by the caller before tightening.
 */
static int
micveth_probe_int(micveth_info_t *veth_info, mic_ctx_t *mic_ctx)
{
	struct net_device *dev_veth;
	ring_queue_t *queue;
	ring_desc_t *desc;
	ring_packet_t *packet;
	int idx;
	int err = 0;

	/* Cache SBOX MMIO base and the two scratch registers used for the
	 * link-up handshake with the card. */
	veth_info->vi_pdev = mic_ctx->bi_pdev;
	veth_info->vi_sbox = (uint8_t *)((unsigned long)mic_ctx->mmio.va +
					 HOST_SBOX_BASE_ADDRESS);
	veth_info->vi_scratch14 = (uint32_t *)((unsigned long)mic_ctx->mmio.va +
					       HOST_SBOX_BASE_ADDRESS + SBOX_SCRATCH14);
	veth_info->vi_scratch15 = (uint32_t *)((unsigned long)mic_ctx->mmio.va +
					       HOST_SBOX_BASE_ADDRESS + SBOX_SCRATCH15);
	veth_info->mic_ctx = mic_ctx;
	mic_ctx->bi_vethinfo = (void *)veth_info;

	spin_lock_init(&veth_info->vi_txlock);
	spin_lock_init(&veth_info->vi_rxlock);

	if (mic_vnet_mode == VNET_MODE_POLL)
		INIT_DELAYED_WORK(&veth_info->vi_poll, micveth_poll);

	// Set the current sk_buff allocation size
	veth_info->vi_skb_mtu = MICVETH_MAX_PACKET_SIZE + 32;

	// Get the physical memory address for the ring descriptors
	veth_info->vi_ring.phys = mic_ctx_map_single(veth_to_ctx(veth_info), &veth_info->vi_ring.ring,
						     sizeof(veth_ring_t));
	veth_info->vi_ring.length = sizeof(veth_ring_t);

	/* Transmit ring: empty descriptors, filled by micveth_deliver(). */
	queue = &veth_info->vi_ring.ring.r_tx;
	queue->rq_head = 0;
	queue->rq_tail = 0;
	queue->rq_length = MICVETH_TRANSFER_FIFO_SIZE;

	veth_info->vi_pend = 0;

	packet = &veth_info->vi_tx_desc[0];
	for (idx = 0; idx < queue->rq_length; idx++) {
		desc = &queue->rq_descs[idx];
		packet[idx].pd_skb = NULL;
		packet[idx].pd_phys = 0;
		packet[idx].pd_length = 0;

		desc->rd_phys = 0;
		desc->rd_length = 0;
		desc->rd_valid = 0;
	}

	// This is the receive end: every slot gets a mapped skb up front.
	queue = &veth_info->vi_ring.ring.r_rx;
	queue->rq_head = 0;
	queue->rq_tail = 0;
	queue->rq_length = MICVETH_TRANSFER_FIFO_SIZE;

	packet = &veth_info->vi_rx_desc[0];
	for (idx = 0; idx < queue->rq_length; idx++) {
		desc = &queue->rq_descs[idx];
		if (!(packet[idx].pd_skb = dev_alloc_skb(veth_info->vi_skb_mtu)))
			return -ENOMEM;
		packet[idx].pd_phys = mic_ctx_map_single(veth_to_ctx(veth_info), packet[idx].pd_skb->data,
							 veth_info->vi_skb_mtu);
		packet[idx].pd_length = veth_info->vi_skb_mtu;

		desc->rd_phys = packet[idx].pd_phys;
		desc->rd_length = packet[idx].pd_length;
		desc->rd_valid = 1;
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0)
	if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", micveth_setup)) == NULL) {
#else
	if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", NET_NAME_UNKNOWN, micveth_setup)) == NULL) {
#endif
		return -ENOMEM;
	}

	veth_info->vi_netdev = dev_veth;
	dev_veth->ml_priv = veth_info;
	dev_veth->rtnl_link_ops = &micveth_link_ops;

	if ((err = register_netdev(dev_veth)) < 0) {
		printk("register netdev failed %d\n", err);
		free_netdev(dev_veth);
		return err;
	}

	veth_info->vi_state = VETH_STATE_INITIALIZED;
	return 0;
}
+
+static ssize_t show_veth(struct device *dev,
+ struct device_attribute *attr, char *buf);
+DEVICE_ATTR(veth, S_IRUGO, show_veth, NULL);
+
+static int
+micveth_init_int(int num_bds, struct device *dev)
+{
+ int bd;
+ int err = 0;
+
+ micveth.lv_num_interfaces = num_bds;
+ micveth.lv_num_clients = num_bds;
+ micveth.lv_active_clients = 0;
+ micveth.lv_num_links_remaining = num_bds;
+
+ BUG_ON(rtnl_link_register(&micveth_link_ops));
+
+ // Allocate space for the control of each device in the system.
+ micveth.lv_info = kmalloc(sizeof(micveth_info_t) * num_bds, GFP_KERNEL);
+
+ // Initialize state mutex. Overloaded use for several fields.
+ mutex_init(&micveth.lv_state_mutex);
+
+ // Setup of timer for probeing active mic clients. When the total active board
+ // count is zero the poll is not running.
+ micveth.lv_pollstate = CLIENT_POLL_STOPPED;
+ INIT_DELAYED_WORK(&micveth.lv_poll, micveth_clientpoll);
+ init_waitqueue_head(&micveth.lv_wq);
+
+ // Init each of the existing boards.
+ for (bd = 0; bd < num_bds; bd++) {
+ micveth_probe_int(&micveth.lv_info[bd], &mic_data.dd_bi[bd]->bi_ctx);
+ }
+
+ err = device_create_file(dev, &dev_attr_veth);
+ return err;
+}
+
+static void
+micveth_exit_int(void)
+{
+ mic_ctx_t *mic_ctx = kmalloc(sizeof(mic_ctx_t), GFP_KERNEL);
+ micveth_info_t *veth_info;
+ ring_packet_t *packet;
+ int bd;
+ int idx;
+
+ rtnl_link_unregister(&micveth_link_ops);
+
+ for (bd = 0; bd < micveth.lv_num_clients; bd++) {
+ veth_info = &micveth.lv_info[bd];
+
+ /* veth_info->mic_ctx == mic_data.dd_bi[bd] is freed in
+ remove so cannot be used in exit */
+ mic_ctx->bi_vethinfo = veth_info;
+ micveth_stop(mic_ctx);
+
+#if WA_UNMAP_AT_RMMOD
+ mic_ctx_unmap_single(veth_to_ctx(veth_info), veth_info->vi_ring.phys,
+ sizeof(veth_ring_t));
+#endif
+
+ for (idx = 0; idx < veth_info->vi_ring.ring.r_tx.rq_length; idx++) {
+ packet = &veth_info->vi_tx_desc[idx];
+ if (packet->pd_skb != NULL) {
+#if WA_UNMAP_AT_RMMOD
+ mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys,
+ packet->pd_skb->len);
+#endif
+ kfree_skb(packet->pd_skb);
+ }
+ }
+
+ for (idx = 0; idx < veth_info->vi_ring.ring.r_rx.rq_length; idx++) {
+ packet = &veth_info->vi_rx_desc[idx];
+#if WA_UNMAP_AT_RMMOD
+ mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys, packet->pd_skb->len);
+#endif
+ kfree_skb(packet->pd_skb);
+ }
+ }
+
+ kfree(mic_ctx);
+ kfree(micveth.lv_info);
+}
+
/*
 * micveth_start_int - bring one board's legacy vnet interface up.
 *
 * Starts the client poll (if it was stopped), resets the ring indices,
 * moves the interface to LINKDOWN awaiting the card handshake, and in
 * interrupt mode installs the doorbell-3 handler plus its workqueue.
 * Always returns 0.
 */
static int
micveth_start_int(mic_ctx_t *mic_ctx)
{
	micveth_info_t *veth_info = &micveth.lv_info[mic_ctx->bi_id];

	// Eventually (very soon) most of the descriptor allocation for a board will be done here
	/* Already started: INITIALIZED means "probed but not running". */
	if (veth_info->vi_state != VETH_STATE_INITIALIZED)
		return 0;

	mutex_lock(&micveth.lv_state_mutex);

	if (micveth.lv_pollstate == CLIENT_POLL_STOPPED) {
		schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY));
		micveth.lv_pollstate = CLIENT_POLL_RUNNING;
	}

	micveth.lv_active_clients++;
	mutex_unlock(&micveth.lv_state_mutex);

	veth_info->vi_pend = 0;

	/* Fresh link: reset both ring index pairs. */
	veth_info->vi_ring.ring.r_tx.rq_head = 0;
	veth_info->vi_ring.ring.r_tx.rq_tail = 0;

	veth_info->vi_ring.ring.r_rx.rq_head = 0;
	veth_info->vi_ring.ring.r_rx.rq_tail = 0;
	veth_info->vi_state = VETH_STATE_LINKDOWN;

	if (mic_vnet_mode == VNET_MODE_INTR) {
		snprintf(veth_info->vi_wqname, sizeof(veth_info->vi_wqname),
			 "VNET INTR %d\n", mic_ctx->bi_id);
		veth_info->vi_wq = create_singlethread_workqueue(veth_info->vi_wqname);
		INIT_WORK(&veth_info->vi_bh, micvnet_intr_bh_handler);

		// Install interrupt handler on doorbell 3
		mic_reg_irqhandler(mic_ctx, 3, "Host DoorBell 3",
				   micvnet_host_doorbell_intr_handler);
	}

	return 0;
}
+
/*
 * micveth_stop_int - take one board's legacy vnet interface down.
 *
 * Removes the doorbell handler/workqueue in interrupt mode and returns
 * the interface to INITIALIZED. When the last active client goes away the
 * client poll is stopped (immediately, or by handshake with the poll
 * worker, depending on PWR_MGMT_NO_POLL_AFTER_LINKS_UP).
 */
static void
micveth_stop_int(mic_ctx_t *mic_ctx)
{
	micveth_info_t *veth_info = (micveth_info_t *)(mic_ctx->bi_vethinfo);

	/* Not running: INITIALIZED means "probed but not started". */
	if (veth_info->vi_state == VETH_STATE_INITIALIZED)
		return;

	mutex_lock(&micveth.lv_state_mutex);

	if (mic_vnet_mode == VNET_MODE_INTR) {
		// Remove interrupt handler on doorbell 3
		mic_unreg_irqhandler(mic_ctx, 3, "Host DoorBell 3");

		destroy_workqueue(veth_info->vi_wq);
	}

	micveth.lv_active_clients--;
	veth_info->vi_state = VETH_STATE_INITIALIZED;

	/* Other boards still active: leave the poll running. */
	if (micveth.lv_active_clients) {
		mutex_unlock(&micveth.lv_state_mutex);
		return;
	}

	micveth.lv_num_links_remaining = micveth.lv_num_clients;

#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP
	/* Poll already idle once all links are up; just mark it stopped. */
	micveth.lv_pollstate = CLIENT_POLL_STOPPED;
	mutex_unlock(&micveth.lv_state_mutex);
#else
	/* Ask the poll worker to stop and wait for its acknowledgement. */
	micveth.lv_pollstate = CLIENT_POLL_STOPPING;
	mutex_unlock(&micveth.lv_state_mutex);
	wait_event(micveth.lv_wq, micveth.lv_pollstate == CLIENT_POLL_STOPPED);
#endif
}
+
+#define NO_SRATCHREGREAD_AFTER_CONNECT 1
/*
 * micveth_clientpoll - delayed work that watches each board's link state.
 *
 * The card signals link changes through SBOX scratch register 14
 * (MICVETH_LINK_UP_MAGIC / MICVETH_LINK_DOWN_MAGIC). On link-up the host
 * answers by writing the 64-bit physical address of the transfer ring
 * into scratch 14/15. Re-arms itself until all links are up (when
 * PWR_MGMT_NO_POLL_AFTER_LINKS_UP) or until asked to stop.
 */
static void
micveth_clientpoll(struct work_struct *work)
{
	micveth_info_t *veth_info;
	uint32_t transRingHi;
	uint32_t transRingLo;
	uint32_t scratch14 = 0;
	uint32_t scratch15 = 0;
	int bd;
	static int enter = 0;

	/* Log only on the first invocation. */
	if (enter == 0)
	{
		printk("micveth is polling\n");
		enter = 1;
	}

	/* Stop handshake with micveth_stop_int(). */
	mutex_lock(&micveth.lv_state_mutex);
	if (micveth.lv_pollstate == CLIENT_POLL_STOPPING) {
		micveth.lv_pollstate = CLIENT_POLL_STOPPED;
		mutex_unlock(&micveth.lv_state_mutex);
		wake_up(&micveth.lv_wq);
		return;
	}

	// Check for state changes for each board in the system
	for (bd = 0; bd < micveth.lv_num_clients; bd++) {
		veth_info = &micveth.lv_info[bd];

		// Do not poll boards that have not had the interface started.
		/* NOTE(review): this break skips ALL remaining boards, not
		 * just this one — confirm whether continue was intended. */
		if (veth_info->vi_state == VETH_STATE_INITIALIZED) {
			break;
		}

#ifdef NO_SRATCHREGREAD_AFTER_CONNECT
		/* Once linked, scratch14/15 hold the ring address, not the
		 * link magic, so stop reading them. */
		if(veth_info->vi_state != VETH_STATE_LINKUP) {
#endif
			scratch14 = readl(veth_info->vi_scratch14);
			scratch15 = readl(veth_info->vi_scratch15);
#ifdef NO_SRATCHREGREAD_AFTER_CONNECT
		}
#endif

		if (veth_info->vi_state == VETH_STATE_LINKUP) {
			if (scratch14 == MICVETH_LINK_DOWN_MAGIC) {
				veth_info->vi_state = VETH_STATE_LINKDOWN;
			}
		} else if (veth_info->vi_state == VETH_STATE_LINKDOWN) {
			if (scratch14 == MICVETH_LINK_UP_MAGIC) {
				// Write the transfer ring address.
				transRingHi = (uint32_t)(veth_info->vi_ring.phys >> 32);
				transRingLo = (uint32_t)(veth_info->vi_ring.phys & 0xffffffff);

				writel(transRingLo, veth_info->vi_scratch14);
				writel(transRingHi, veth_info->vi_scratch15);

				veth_info->vi_state = VETH_STATE_LINKUP;
				printk("MIC virtual ethernet up for board %d\n", bd);
#ifdef MIC_IS_EMULATION
				printk("Card wrote Magic: It must be UP!\n");
#endif

				if (mic_vnet_mode == VNET_MODE_POLL) {
					schedule_delayed_work(&veth_info->vi_poll,
						msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY));
				}

				micveth.lv_num_links_remaining--;
			}
#ifdef MIC_IS_EMULATION
			else if (scratch14) {
				printk("---> 0x%x \n", scratch14);
				writel(0x0, veth_info->vi_scratch14);
			}
#endif
		}
	}

	mutex_unlock(&micveth.lv_state_mutex);

#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP
	if (micveth.lv_num_links_remaining)
#endif
		schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY));
}
+
+static int
+micvnet_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell)
+{
+ micveth_info_t *veth_info;
+ veth_info = &micveth.lv_info[mic_ctx->bi_id];
+ queue_work(veth_info->vi_wq, &veth_info->vi_bh);
+ return 0;
+}
+
+void
+micveth_send_intr(micveth_info_t *veth_info)
+{
+ mic_ctx_t *mic_ctx = veth_info->mic_ctx;
+ mic_send_vnet_intr(mic_ctx);
+}
+
/*
 * _micveth_process_descriptors - drain both rings for one interface.
 *
 * RX: for each filled descriptor, hand the skb to the network stack,
 * replace it with a freshly allocated/mapped skb, then advance the head.
 * TX: free/unmap every skb the card has consumed (vi_pend..rq_head).
 * In poll mode, re-arms the poll work at the end.
 */
void
_micveth_process_descriptors(micveth_info_t *veth_info)
{
	veth_ring_t *ring = &veth_info->vi_ring.ring;
	ring_queue_t *rx_queue = &ring->r_rx;
	ring_queue_t *tx_queue = &ring->r_tx;
	ring_desc_t *desc;
	ring_packet_t *packet;
	struct sk_buff *skb;
	int receive_skb = 0;
	int err;

	if (veth_info->vi_state != VETH_STATE_LINKUP) {
		return;
	}

	spin_lock_bh(&veth_info->vi_rxlock);

	while (rx_queue->rq_head != rx_queue->rq_tail) {
		desc = &rx_queue->rq_descs[rx_queue->rq_head];

		veth_info->vi_netdev->stats.rx_packets++;
		veth_info->vi_netdev->stats.rx_bytes += desc->rd_length;

		packet = &veth_info->vi_rx_desc[rx_queue->rq_head];

		skb = packet->pd_skb;
		skb_put(skb, desc->rd_length);

		//dump_skb(skb, 0);
		mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys, veth_info->vi_skb_mtu);
		/* NOTE(review): dev_alloc_skb() result is not checked — on
		 * OOM the following map of pd_skb->data dereferences NULL.
		 * Needs a drop-and-reuse fallback; confirm before fixing. */
		packet->pd_skb = dev_alloc_skb(veth_info->vi_skb_mtu);
		packet->pd_phys = mic_ctx_map_single(veth_to_ctx(veth_info), packet->pd_skb->data,
						     veth_info->vi_skb_mtu);
		desc->rd_phys = packet->pd_phys;
		desc->rd_length = packet->pd_length;

		skb->dev = veth_info->vi_netdev;
		skb->protocol = eth_type_trans(skb, skb->dev);
		skb->ip_summed = CHECKSUM_NONE;

		err = netif_receive_skb(skb);
		/*
		 * Need a general memory barrier between copying the data from
		 * the buffer and updating the head pointer. It's the general
		 * mb() because we're ordering the read of the data with the write.
		 */
		mb();
		rx_queue->rq_head = (rx_queue->rq_head + 1) % rx_queue->rq_length;
		receive_skb++;
	}

	/* Send intr to TX so that pending SKB's can be freed */
	if (receive_skb && mic_vnet_mode == VNET_MODE_INTR) {
		micveth_send_intr(veth_info);
	}

	spin_unlock_bh(&veth_info->vi_rxlock);

	spin_lock_bh(&veth_info->vi_txlock);

	// Also handle completed tx requests
	while (veth_info->vi_pend != tx_queue->rq_head) {
		desc = &tx_queue->rq_descs[veth_info->vi_pend];
		packet = &veth_info->vi_tx_desc[veth_info->vi_pend];

		skb = packet->pd_skb;
		packet->pd_skb = NULL;

		mic_ctx_unmap_single(veth_to_ctx(veth_info), packet->pd_phys, skb->len);
		packet->pd_phys = 0;

		kfree_skb(skb);

		veth_info->vi_pend = (veth_info->vi_pend + 1) % tx_queue->rq_length;
	}

	spin_unlock_bh(&veth_info->vi_txlock);

	if (mic_vnet_mode == VNET_MODE_POLL) {
		schedule_delayed_work(&veth_info->vi_poll, msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY));
	}
}
+
+static void
+micvnet_intr_bh_handler(struct work_struct *work)
+{
+ micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_bh);
+ _micveth_process_descriptors(veth_info);
+}
+
+static void
+micveth_poll(struct work_struct *work)
+{
+ micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_poll.work);
+
+ _micveth_process_descriptors(veth_info);
+}
+
+static ssize_t
+show_veth(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ micveth.lv_pollstate == CLIENT_POLL_RUNNING ?
+ "running" : "stopped");
+}
+
+/*
+ VNET driver public API. These are simply wrappers which either invoke the old
+ interrupt/poll mode functions or the new DMA mode functions. These are temporary and
+ will be phased out with the old interrupt/poll mode so only the DMA mode will be around
+ eventually.
+ */
+int __init
+micveth_init(struct device *dev)
+{
+ printk("vnet: mode: %s, buffers: %d\n",
+ mic_vnet_modes[mic_vnet_mode], vnet_num_buffers);
+
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ return micvnet_init(dev);
+ /* Intr/poll modes use micveth_init_legacy */
+ return 0;
+}
+
+int __init
+micveth_init_legacy(int num_bds, struct device *dev)
+{
+ if (mic_vnet_mode != VNET_MODE_DMA)
+ return micveth_init_int(num_bds, dev);
+ /* DMA mode uses micveth_init */
+ return 0;
+}
+
+void
+micveth_exit(void)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ micvnet_exit();
+ else
+ micveth_exit_int();
+}
+
+int
+micveth_probe(mic_ctx_t *mic_ctx)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ return micvnet_probe(mic_ctx);
+ /* No support for micveth_probe in legacy intr/poll modes */
+ return 0;
+}
+
+void
+micveth_remove(mic_ctx_t *mic_ctx)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ micvnet_remove(mic_ctx);
+ /* No support for micveth_remove in legacy intr/poll modes */
+}
+
+int
+micveth_start(mic_ctx_t *mic_ctx)
+{
+ micveth_info_t *veth_info = mic_ctx->bi_vethinfo;
+ int err;
+
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ err = micvnet_start(mic_ctx);
+ else
+ err = micveth_start_int(mic_ctx);
+
+ if (!err)
+ netif_carrier_on(veth_info->vi_netdev);
+
+ return err;
+}
+
+void
+micveth_stop(mic_ctx_t *mic_ctx)
+{
+ micveth_info_t *veth_info = mic_ctx->bi_vethinfo;
+
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ micvnet_stop(mic_ctx);
+ else
+ micveth_stop_int(mic_ctx);
+
+ if (veth_info)
+ netif_carrier_off(veth_info->vi_netdev);
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "micint.h"
+
+bool mic_psmi_enable = 0;
+
+extern struct bin_attribute mic_psmi_ptes_attr;
+
/* Unmap and free one psmi page.
 * NOTE(review): dma_tbl is indexed with i while va_tbl uses i - 1 —
 * consistent with mic_psmi_alloc_buffer() storing dma pages at [i + 1]
 * and va pages at [i], which implies callers pass 1-based i here.
 * Verify against the (not visible) free loop before changing. */
static __always_inline void
mic_psmi_free_pte(mic_ctx_t *mic_ctx, int i)
{
	pci_unmap_single(mic_ctx->bi_pdev,
		mic_ctx->bi_psmi.dma_tbl[i].pa, MIC_PSMI_PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
	free_pages(mic_ctx->bi_psmi.va_tbl[i - 1].pa, MIC_PSMI_PAGE_ORDER);
}
+
+/*
+ * Allocate the PSMI dump buffer for a board.
+ *
+ * Builds two parallel tables covering nr_dma_pages pages of
+ * MIC_PSMI_PAGE_SIZE bytes each:
+ *  - va_tbl[0..n-1].pa : CPU virtual address of each page
+ *  - dma_tbl[1..n].pa  : bus address of each page (DMA-mapped)
+ * dma_tbl[0] and dma_tbl[n + 1] hold MIC_PSMI_SIGNATURE sentinels, and
+ * dma_tbl itself is DMA-mapped so the card can read it.
+ *
+ * Returns 0 on success or -ENOMEM; all partially-created state is
+ * unwound before returning an error.
+ */
+static int mic_psmi_alloc_buffer(mic_ctx_t *mic_ctx)
+{
+	int i, j, ret;
+	void *va;
+	dma_addr_t dma_hndl;
+	struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi;
+
+	/* allocate psmi page tables */
+	psmi_ctx->nr_dma_pages =
+		ALIGN(psmi_ctx->dma_mem_size,
+			MIC_PSMI_PAGE_SIZE) / MIC_PSMI_PAGE_SIZE;
+	if ((psmi_ctx->va_tbl =
+		kmalloc(psmi_ctx->nr_dma_pages *
+			sizeof(struct mic_psmi_pte), GFP_KERNEL)) == NULL) {
+		printk("mic: psmi va table alloc failed\n");
+		return -ENOMEM;
+	}
+	/* +2 entries for the leading and trailing signature slots */
+	psmi_ctx->dma_tbl_size =
+		(psmi_ctx->nr_dma_pages + 2) * sizeof(struct mic_psmi_pte);
+	if ((psmi_ctx->dma_tbl =
+		kmalloc(psmi_ctx->dma_tbl_size, GFP_KERNEL)) == NULL) {
+		printk("mic: psmi dma table alloc failed\n");
+		ret = -ENOMEM;
+		goto free_va_tbl;
+	}
+	psmi_ctx->dma_tbl_hndl =
+		pci_map_single(mic_ctx->bi_pdev,
+			psmi_ctx->dma_tbl, psmi_ctx->dma_tbl_size, PCI_DMA_BIDIRECTIONAL);
+	if (pci_dma_mapping_error(mic_ctx->bi_pdev,
+			psmi_ctx->dma_tbl_hndl)) {
+		printk("mic: psmi dma table mapping failed\n");
+		ret = -ENOMEM;
+		goto free_dma_tbl;
+	}
+
+	/* allocate psmi pages */
+	for (i = 0; i < psmi_ctx->nr_dma_pages; i++) {
+		if ((va = (void *)__get_free_pages(
+				GFP_KERNEL | __GFP_HIGHMEM,
+				MIC_PSMI_PAGE_ORDER)) == NULL) {
+			printk("mic: psmi page alloc failed: %d\n", i);
+			ret = -ENOMEM;
+			goto free_ptes;
+		}
+		memset(va, 0, MIC_PSMI_PAGE_SIZE);
+		dma_hndl = pci_map_single(mic_ctx->bi_pdev, va,
+			MIC_PSMI_PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+		if (pci_dma_mapping_error(mic_ctx->bi_pdev, dma_hndl)) {
+			printk("mic: psmi page mapping failed: %d\n", i);
+			free_pages((unsigned long)va, MIC_PSMI_PAGE_ORDER);
+			ret = -ENOMEM;
+			goto free_ptes;
+		}
+		psmi_ctx->dma_tbl[i + 1].pa = dma_hndl;
+		psmi_ctx->va_tbl[i].pa = (uint64_t)va;
+	}
+	psmi_ctx->dma_tbl[0].pa = MIC_PSMI_SIGNATURE;
+	psmi_ctx->dma_tbl[psmi_ctx->nr_dma_pages + 1].pa = MIC_PSMI_SIGNATURE;
+	printk("mic: psmi #%d, %ld bytes, "
+		"dma_tbl va=0x%lx hndl=0x%lx\n", mic_ctx->bi_id + 1,
+		(unsigned long)psmi_ctx->dma_mem_size,
+		(unsigned long)psmi_ctx->dma_tbl,
+		(unsigned long)psmi_ctx->dma_tbl_hndl);
+	return 0;
+free_ptes:
+	/*
+	 * Pages 0..i-1 were fully allocated and mapped; in the 1-based
+	 * indexing used by mic_psmi_free_pte() those are entries 1..i,
+	 * so the unwind bound must be inclusive.  The previous
+	 * "j < i" bound leaked the last successfully mapped page.
+	 */
+	for (j = 1; j <= i; j++)
+		mic_psmi_free_pte(mic_ctx, j);
+	pci_unmap_single(mic_ctx->bi_pdev,
+		psmi_ctx->dma_tbl_hndl, psmi_ctx->dma_tbl_size, PCI_DMA_BIDIRECTIONAL);
+free_dma_tbl:
+	kfree(psmi_ctx->dma_tbl);
+	psmi_ctx->dma_tbl = NULL;
+free_va_tbl:
+	kfree(psmi_ctx->va_tbl);
+	psmi_ctx->va_tbl = NULL;
+	return ret;
+}
+
+/* Release every PSMI page and the DMA-mapped page table for a board. */
+static void mic_psmi_free_buffer(mic_ctx_t *mic_ctx)
+{
+	struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi;
+	int pg;
+
+	/* mic_psmi_free_pte() takes a 1-based page index */
+	for (pg = 1; pg <= psmi_ctx->nr_dma_pages; pg++)
+		mic_psmi_free_pte(mic_ctx, pg);
+	pci_unmap_single(mic_ctx->bi_pdev, psmi_ctx->dma_tbl_hndl,
+		psmi_ctx->dma_tbl_size, PCI_DMA_BIDIRECTIONAL);
+	kfree(psmi_ctx->dma_tbl);
+	kfree(psmi_ctx->va_tbl);
+	psmi_ctx->dma_tbl = NULL;
+	psmi_ctx->va_tbl = NULL;
+	printk("mic: psmi freed %ld bytes for board #%d\n",
+		(unsigned long)psmi_ctx->dma_mem_size, mic_ctx->bi_id + 1);
+}
+
+extern int usagemode_param;
+
+/*
+ * One-time PSMI init, run only for board 0 when the mic_psmi_enable
+ * module flag is set.  Reads the card memory size from SBOX_SCRATCH0
+ * (waiting for bootstrap if the register still reads 0), computes the
+ * dump-buffer size and, in normal usage mode, allocates the buffer and
+ * publishes the page-table size via the sysfs ptes attribute.
+ * Returns 0, or a negative errno from mic_psmi_alloc_buffer().
+ */
+int mic_psmi_init(mic_ctx_t *mic_ctx)
+{
+ int ret;
+ int status = 0;
+ uint32_t scratch0;
+ struct mic_psmi_ctx * psmi_ctx = &mic_ctx->bi_psmi;
+
+ psmi_ctx->enabled = 0;
+ /* Only initialize psmi for the first board */
+ if (!mic_psmi_enable || mic_ctx->bi_id)
+ return 0;
+ /* NOTE(review): wait_for_bootstrap()'s status is assigned but never
+ * checked — presumably a best-effort wait before the re-read; confirm. */
+ if(!(scratch0 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH0))) {
+ status = wait_for_bootstrap(mic_ctx->mmio.va);
+ scratch0 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH0);
+ }
+ /* Memory size includes 512K reserved for VGA & GTT table */
+ /* KB -> bytes, plus one extra PSMI page */
+ psmi_ctx->dma_mem_size =
+ SCRATCH0_MEM_SIZE_KB(scratch0) * ((1) * 1024) +
+ MIC_PSMI_PAGE_SIZE;
+ if (USAGE_MODE_NORMAL == usagemode_param) {
+ if ((ret = mic_psmi_alloc_buffer(mic_ctx)))
+ return ret;
+ mic_psmi_ptes_attr.size = psmi_ctx->dma_tbl_size;
+ }
+ psmi_ctx->enabled = 1;
+ return 0;
+}
+
+/* Tear down PSMI state for a board; no-op unless psmi was enabled. */
+void mic_psmi_uninit(mic_ctx_t *mic_ctx)
+{
+	struct mic_psmi_ctx *psmi_ctx = &mic_ctx->bi_psmi;
+
+	if (!psmi_ctx->enabled)
+		return;
+	/* buffers only exist in normal usage mode (see mic_psmi_init) */
+	if (usagemode_param == USAGE_MODE_NORMAL)
+		mic_psmi_free_buffer(mic_ctx);
+	psmi_ctx->enabled = 0;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic_common.h"
+#include "scif.h"
+#include "mic/micscif.h"
+#include "mic/mic_pm.h"
+#include "mic/micveth.h"
+
+extern int set_host_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state);
+extern int pc6_entry_start(mic_ctx_t *mic_ctx);
+
+/*
+ * Drop one PM client from the global connected-clients count.
+ * A negative result indicates an accounting bug and is only logged.
+ */
+void
+micpm_decrement_clients(void)
+{
+	int remaining = atomic_dec_return(&mic_data.dd_pm.connected_clients);
+
+	if (unlikely(remaining < 0)) {
+		PM_DEBUG("connected_clients is negative (%d)\n",
+			atomic_read(&mic_data.dd_pm.connected_clients));
+	}
+}
+
+/*
+ * Debug-log names for PM message opcodes, indexed by opcode value in
+ * micpm_display_message().  Presumably mirrors the PM_MESSAGE enum
+ * order — verify against the enum definition in mic_pm.h.
+ */
+static char *pm_message_types[PM_MESSAGE_MAX+1] = {"PM_MESSAGE_PC3READY",
+ "PM_MESSAGE_OPEN",
+ "PM_MESSAGE_OPEN_ACK",
+ "PM_MESSAGE_CLOSE",
+ "PM_MESSAGE_CLOSE_ACK",
+ "PM_MESSAGE_TEST",
+ "PM_MESSAGE_MAX"};
+/*
+ * Log a PM message (header + optional payload) for debugging.
+ * @label selects the wording of the endpoint fields ("SENT" vs
+ * anything else, e.g. "RECV").  No-op when the device has no PM
+ * endpoint.
+ */
+void
+micpm_display_message(mic_ctx_t *mic_ctx, void *header, void *msg, const char* label) {
+	pm_msg_header *header_ref;
+	int msg_len;
+	int i = 0;
+	char *payload;
+	scif_epd_t epd = mic_ctx->micpm_ctx.pm_epd;
+
+	header_ref = (pm_msg_header *)header;
+	msg_len = header_ref->len;
+
+	if (!epd)
+		return;
+
+	/* only opcodes within the table have printable names */
+	if (0 <= header_ref->opcode && header_ref->opcode < PM_MESSAGE_MAX) {
+		if (strcmp(label, "SENT") == 0) {
+			printk("%s: Msg type %s, SrcNode:SrcPort %d:%d, DestNode:DestPort %d:%d", label,
+				pm_message_types[header_ref->opcode], epd->port.node, epd->port.port,
+				epd->peer.node, epd->peer.port);
+		}
+		else
+			printk("%s: Msg type %s, DestNode:DestPort %d:%d, SrcNode:SrcPort %d:%d", label,
+				pm_message_types[header_ref->opcode], epd->port.node, epd->port.port,
+				epd->peer.node, epd->peer.port);
+	}
+
+	if (msg != NULL) {
+		payload = (char *)msg;
+		printk(" Payload");
+		for (i = 0; i < msg_len; i++) {
+			/*
+			 * Cast to unsigned char: plain char may be signed,
+			 * and a negative byte passed to %02x sign-extends
+			 * and prints as 0xffffffXX.
+			 */
+			printk("0x%02x:", (unsigned char)payload[i]);
+		}
+	}
+}
+
+/*
+ * Enable or disable PC6 at runtime.  Enabling is only legal when the
+ * card advertised PC6 support (pm_options.pc6_enabled); disabling
+ * always succeeds and cycles a PM reference to pull the card out of
+ * the idle state.  Returns 0 or -EINVAL.
+ */
+int micpm_update_pc6(mic_ctx_t *mic_ctx, bool set)
+{
+	micpm_ctx_t *pm_ctx = &mic_ctx->micpm_ctx;
+
+	if (!pm_ctx->pm_options.pc6_enabled) {
+		if (set)
+			return -EINVAL;
+		pm_ctx->pc6_enabled = set;
+		return 0;
+	}
+
+	if (set && !pm_ctx->pc6_enabled) {
+		pm_ctx->pc6_enabled = set;
+		/* arm the PC6 entry timer */
+		queue_delayed_work(pm_ctx->pc6_entry_wq,
+				&pm_ctx->pc6_entry_work,
+				pm_ctx->pc6_timeout * HZ);
+	}
+	if (!set) {
+		pm_ctx->pc6_enabled = set;
+		micpm_get_reference(mic_ctx, true);
+		micpm_put_reference(mic_ctx);
+	}
+	return 0;
+}
+
+/*
+ * Enable or disable PC3 at runtime.  Enabling is only legal when the
+ * card advertised PC3 support; disabling always succeeds and cycles a
+ * PM reference to pull the card out of idle.  Returns 0 or -EINVAL.
+ */
+int micpm_update_pc3(mic_ctx_t *mic_ctx, bool set)
+{
+	micpm_ctx_t *pm_ctx = &mic_ctx->micpm_ctx;
+
+	if (!pm_ctx->pm_options.pc3_enabled) {
+		if (set)
+			return -EINVAL;
+		pm_ctx->pc3_enabled = set;
+		return 0;
+	}
+
+	pm_ctx->pc3_enabled = set;
+	if (!set) {
+		micpm_get_reference(mic_ctx, true);
+		micpm_put_reference(mic_ctx);
+	}
+	return 0;
+}
+
+/*
+ * Send @len bytes from @msg to the card over the per-device PM
+ * endpoint.  Validates context, buffer and connection state first.
+ * Returns 0 on success or a negative errno (0-byte sends propagate
+ * scif_send's return of 0).
+ */
+int
+mic_pm_send(mic_ctx_t *mic_ctx, void *msg, uint32_t len)
+{
+	scif_epd_t epd;
+	int sent;
+
+	if (!mic_ctx) {
+		PM_DEBUG("Mic context not Initialized\n");
+		return -EINVAL;
+	}
+	if (!msg || !len) {
+		PM_DEBUG("Invalid Parameters\n");
+		return -EINVAL;
+	}
+	epd = mic_ctx->micpm_ctx.pm_epd;
+	if (!epd) {
+		PM_DEBUG("Scif Endpoint Undefined\n");
+		return -EINVAL;
+	}
+	if (mic_ctx->micpm_ctx.con_state != PM_CONNECTING &&
+	    mic_ctx->micpm_ctx.con_state != PM_CONNECTED) {
+		PM_DEBUG("Endpoint not in connected state\n");
+		return -EINVAL;
+	}
+
+	/* scif_send returns the number of bytes sent on success */
+	sent = scif_send(epd, msg, len, PM_SEND_MODE);
+	if (sent <= 0) {
+		PM_DEBUG("scif_send to node: %d port: %d failed with error %d\n",
+			epd->peer.node, epd->peer.port, sent);
+		return sent;
+	}
+	PM_DEBUG("Bytes sent = %d\n", sent);
+	return 0;
+}
+
+/*
+ * Receive exactly @len bytes from the card's PM endpoint into @msg.
+ * Validates context, buffer and connection state first.
+ * Returns 0 on success, -ENXIO if the peer sent nothing, or a
+ * negative errno from scif_recv.
+ */
+int
+mic_pm_recv(mic_ctx_t *mic_ctx, void *msg, uint32_t len)
+{
+	scif_epd_t epd;
+	int received;
+
+	if (!mic_ctx) {
+		PM_DEBUG("Mic context not Initialized\n");
+		return -EINVAL;
+	}
+	if (!msg || !len) {
+		PM_DEBUG("Invalid Parameters\n");
+		return -EINVAL;
+	}
+	epd = mic_ctx->micpm_ctx.pm_epd;
+	if (!epd) {
+		PM_DEBUG("Scif Endpoint Undefined\n");
+		return -EINVAL;
+	}
+	if (mic_ctx->micpm_ctx.con_state != PM_CONNECTING &&
+	    mic_ctx->micpm_ctx.con_state != PM_CONNECTED) {
+		PM_DEBUG("Endpoint not in connected state\n");
+		return -EINVAL;
+	}
+
+	received = scif_recv(epd, msg, len, PM_RECV_MODE);
+	if (received > 0) {
+		PM_DEBUG("Bytes received = %d\n", received);
+		return 0;
+	}
+	pr_debug("scif_recv failed with error %d\n", received);
+	/* 0 bytes means the peer went away; map it to a real error */
+	return received ? received : -ENXIO;
+}
+
+/*
+ * Send a PM message of @type with an optional @len-byte payload.
+ * Builds a single contiguous buffer (pm_msg_header followed by the
+ * payload), optionally logs it, sends it via mic_pm_send() and frees
+ * the buffer.  Returns 0 on success or a negative errno.
+ */
+int
+mic_pm_send_msg(mic_ctx_t *mic_ctx, PM_MESSAGE type, void *msg, uint32_t len)
+{
+	pm_msg_header header;
+	char *buf;
+	int err;
+
+	header.opcode = type;
+	header.len = len;
+
+	buf = kmalloc(len + sizeof(pm_msg_header), GFP_KERNEL);
+	if (!buf) {
+		PM_DEBUG("error allocating memory");
+		return -ENOMEM;
+	}
+	memcpy(buf, &header, sizeof(pm_msg_header));
+	if (len && msg)
+		memcpy(buf + sizeof(pm_msg_header), msg, len);
+
+	if (mic_data.dd_pm.enable_pm_logging) {
+		if (len && msg)
+			micpm_display_message(mic_ctx, buf,
+				buf + sizeof(pm_msg_header), "SENT");
+		else
+			micpm_display_message(mic_ctx, buf, NULL, "SENT");
+	}
+	err = mic_pm_send(mic_ctx, buf, len + sizeof(pm_msg_header));
+	kfree(buf);
+	return err;
+}
+
+/*
+ * Handler for PM_MESSAGE_PC3READY: the card reports it is ready for
+ * PC3, so begin host-side PC3 entry.  Returns pm_pc3_entry()'s result.
+ */
+int
+handle_pc3_ready(mic_ctx_t *mic_ctx)
+{
+	int ret;
+
+	PM_ENTRY;
+	ret = pm_pc3_entry(mic_ctx);
+	PM_EXIT;
+	return ret;
+}
+
+/*
+ * Handle PM_MESSAGE_OPEN_ACK from the card: validate the PM protocol
+ * version, record the card's PC3/PC6 options and mark the connection
+ * established.  On version mismatch the connection is torn down
+ * asynchronously via the pm_close work item.
+ * Returns 0 on success or -EINVAL on bad arguments / version mismatch.
+ */
+int
+handle_open_ack(mic_ctx_t *mic_ctx, pm_msg_pm_options *msg)
+{
+	int err = 0;
+	PM_ENTRY;
+
+	if ((mic_ctx == NULL) || (msg == NULL)) {
+		/* was "err = EINVAL": a positive value broke the
+		 * negative-errno convention used by every caller here */
+		err = -EINVAL;
+		goto inval;
+	}
+
+	if ((msg->version.major_version != PM_MAJOR_VERSION) ||
+		(msg->version.minor_version != PM_MINOR_VERSION)) {
+		printk(KERN_ERR "PM Driver version mismatch. "
+			"Expected version: %d.%d Received version %d.%d\n",
+			PM_MAJOR_VERSION, PM_MINOR_VERSION,
+			msg->version.major_version, msg->version.minor_version);
+		schedule_work(&mic_ctx->micpm_ctx.pm_close);
+		/* previously returned 0 despite rejecting the connection */
+		err = -EINVAL;
+		goto inval;
+	}
+
+	mic_ctx->micpm_ctx.pm_options.pc3_enabled = msg->pc3_enabled;
+	mic_ctx->micpm_ctx.pm_options.pc6_enabled = msg->pc6_enabled;
+
+	mic_ctx->micpm_ctx.pc3_enabled =
+		(mic_ctx->micpm_ctx.pm_options.pc3_enabled) ? true : false;
+	mic_ctx->micpm_ctx.pc6_enabled =
+		(mic_ctx->micpm_ctx.pm_options.pc6_enabled) ? true : false;
+
+	mic_ctx->micpm_ctx.con_state = PM_CONNECTED;
+
+inval:
+	PM_EXIT;
+	return err;
+}
+
+/*
+ * Dispatch one received PM message to its opcode handler.
+ * Returns the handler's result, -EINVAL on bad arguments or a
+ * malformed OPEN_ACK payload, and 0 for unknown opcodes (logged only).
+ */
+int
+mic_pm_handle_message(mic_ctx_t *mic_ctx, pm_recv_msg_t *recv_msg)
+{
+	int ret = 0;
+
+	if (!mic_ctx)
+		return -EINVAL;
+	if (!recv_msg) {
+		PM_DEBUG("Undefined message\n");
+		return -EINVAL;
+	}
+
+	switch (recv_msg->msg_header.opcode) {
+	case PM_MESSAGE_PC3READY:
+		ret = handle_pc3_ready(mic_ctx);
+		break;
+	case PM_MESSAGE_OPEN_ACK:
+		/* payload must be exactly the options struct we cast to */
+		if (recv_msg->msg_header.len != sizeof(pm_msg_pm_options)) {
+			printk(KERN_ERR "Incompatible PM message. Opcode = %d\n",
+				recv_msg->msg_header.opcode);
+			return -EINVAL;
+		}
+		ret = handle_open_ack(mic_ctx,
+			(pm_msg_pm_options *)recv_msg->msg_body);
+		break;
+	default:
+		printk(KERN_ERR "Unknown PM message. Opcode = %d\n",
+			recv_msg->msg_header.opcode);
+		break;
+	}
+	return ret;
+}
+
+/*
+ * pm_retrieve_msg:
+ *
+ * Pop the first non-TEST message off the device's PM message list,
+ * under msg_mutex.  TEST messages are left queued untouched.
+ * @mic_ctx: The device context
+ * Returns the detached message, or NULL if none qualifies.
+ */
+pm_recv_msg_t *
+pm_retrieve_msg(mic_ctx_t *mic_ctx) {
+
+	pm_recv_msg_t *found = NULL;
+	pm_recv_msg_t *cur;
+	struct list_head *pos, *next;
+
+	mutex_lock(&mic_ctx->micpm_ctx.msg_mutex);
+	if (!list_empty_careful(&mic_ctx->micpm_ctx.msg_list)) {
+		list_for_each_safe(pos, next, &mic_ctx->micpm_ctx.msg_list) {
+			cur = list_entry(pos, pm_recv_msg_t, msg);
+			/* do not touch test messages */
+			if (cur->msg_header.opcode == PM_MESSAGE_TEST)
+				continue;
+			list_del(&cur->msg);
+			found = cur;
+			break;
+		}
+	}
+	mutex_unlock(&mic_ctx->micpm_ctx.msg_mutex);
+	return found;
+}
+
+/*
+ * pm_process_msg_list:
+ *
+ * Drain the device's PM message list: handle each non-TEST message in
+ * turn and free it.  Stops when the list is empty or only TEST
+ * messages remain.
+ * @mic_ctx[in]: The device context whose message list is processed
+ * Returns: None
+ */
+void
+pm_process_msg_list(mic_ctx_t *mic_ctx) {
+
+	pm_recv_msg_t *msg;
+
+	if (mic_ctx == NULL) {
+		PM_DEBUG("Cannot get device handle \n");
+		return;
+	}
+
+	while (!list_empty(&mic_ctx->micpm_ctx.msg_list)) {
+		msg = pm_retrieve_msg(mic_ctx);
+		if (msg == NULL) {
+			PM_DEBUG("No Message to process.\n");
+			return;
+		}
+		if (mic_pm_handle_message(mic_ctx, msg))
+			PM_DEBUG("Power Management message not processed"
+				" successfully.\n");
+		kfree(msg->msg_body);	/* kfree(NULL) is a no-op */
+		kfree(msg);
+	}
+}
+
+/*
+ * Work item: drain and handle the device's queued PM messages.
+ * Queued by mic_pm_recv_work() after a message is listed.
+ */
+static void
+mic_pm_msg_handle_work(struct work_struct *msg_handle_work)
+{
+	pm_wq_t *wq_entry = container_of(msg_handle_work, pm_wq_t, work);
+	micpm_ctx_t *pm_ctx = container_of(wq_entry, micpm_ctx_t, handle_msg);
+
+	pm_process_msg_list(container_of(pm_ctx, mic_ctx_t, micpm_ctx));
+}
+
+/*
+ * Delayed work item: attempt PC6 entry for the device; on -EAGAIN the
+ * preconditions are not met yet, so re-arm the timer and retry later.
+ */
+static void
+pc6_entry_work(struct work_struct *work)
+{
+	micpm_ctx_t *pm_ctx = container_of(to_delayed_work(work),
+					micpm_ctx_t, pc6_entry_work);
+	mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx);
+
+	if (pc6_entry_start(mic_ctx) == -EAGAIN)
+		queue_delayed_work(pm_ctx->pc6_entry_wq,
+				&pm_ctx->pc6_entry_work,
+				pm_ctx->pc6_timeout * HZ);
+}
+
+/*
+ * Accept work item: block in scif_accept() for one incoming PM
+ * connection, validate it (not from the host node, no existing PM
+ * connection from the same card), bind it to the card's device context
+ * and start PM on it.  The work requeues itself to keep accepting;
+ * only a hard scif_accept() failure stops the loop.
+ */
+static void
+mic_pm_accept_work(struct work_struct *work)
+{
+	scif_epd_t newepd;
+	struct scif_portID portID;
+	int err;
+	uint16_t i;
+	mic_ctx_t *mic_ctx;
+	mic_data_t *mic_data_p = &mic_data;
+
+	PM_DEBUG("Accept thread waiting for new PM connections\n");
+	err = scif_accept(mic_data.dd_pm.epd, &portID, &newepd, SCIF_ACCEPT_SYNC);
+	if (err == -EBUSY || err == -ENODEV) {
+		PM_DEBUG("scif_accept error %d\n", err);
+		goto continue_accepting;
+	}
+	else if (err < 0) {
+		PM_DEBUG("scif_accept failed with errno %d\n", err);
+		goto exit;
+	}
+	PM_DEBUG("Connection request received. \n");
+
+	mutex_lock(&mic_data.dd_pm.pm_accept_mutex);
+
+	if (newepd->peer.node == SCIF_HOST_NODE) {
+		/* Reject connection request from HOST itself */
+		PM_DEBUG("PM: Peer node cannot be HOST. Peer Node = %d Peer Port = %d",
+			newepd->peer.node, newepd->peer.port);
+		scif_close(newepd);
+		mutex_unlock(&mic_data.dd_pm.pm_accept_mutex);
+		goto continue_accepting;
+	}
+
+	/*Only one Power Management connection per node. */
+	for (i = 0; i < mic_data_p->dd_numdevs; i++) {
+		mic_ctx = get_per_dev_ctx(i);
+		if (mic_ctx != NULL) {
+			if (mic_ctx->micpm_ctx.pm_epd != NULL) {
+				if (mic_ctx->micpm_ctx.pm_epd->peer.node == newepd->peer.node) {
+					PM_DEBUG("There is already Power Management connection"
+						" established from this node. Rejecting request.\n");
+					PM_DEBUG("Peer Node = %d, Peer Port = %d\n",
+						mic_ctx->micpm_ctx.pm_epd->peer.node,
+						mic_ctx->micpm_ctx.pm_epd->peer.port);
+					scif_close(newepd);
+					mutex_unlock(&mic_data.dd_pm.pm_accept_mutex);
+					goto continue_accepting;
+				}
+			}
+		}
+	}
+	mutex_unlock(&mic_data.dd_pm.pm_accept_mutex);
+
+	/*
+	 * Translate the peer node id to a board index.  Guard against a
+	 * node id with no device context: the previous unchecked
+	 * dereference was a potential NULL-pointer crash.
+	 */
+	mic_ctx = get_per_dev_ctx(newepd->peer.node - 1);
+	if (mic_ctx == NULL) {
+		PM_DEBUG("No device context for peer node %d. Rejecting request.\n",
+			newepd->peer.node);
+		scif_close(newepd);
+		goto continue_accepting;
+	}
+	mic_ctx->micpm_ctx.pm_epd = newepd;
+	micpm_start(mic_ctx);
+
+continue_accepting:
+	mutex_lock(&mic_data.dd_pm.pm_accept_mutex);
+	queue_work(mic_data.dd_pm.accept.wq,
+		&mic_data.dd_pm.accept.work);
+	mutex_unlock(&mic_data.dd_pm.pm_accept_mutex);
+exit:
+	return;
+}
+
+/*
+ * Work item: block in scif_recv() for one PM message from the card,
+ * then dispatch it.
+ *
+ * Normal messages are appended to micpm_ctx.msg_list and the
+ * message-handler work is queued (TEST messages are listed but not
+ * dispatched); the work then requeues itself to keep listening.
+ * CLOSE/CLOSE_ACK end the receive loop: CLOSE is acked and pm_close
+ * scheduled; CLOSE_ACK wakes the waiter in micpm_stop().  On any
+ * error or terminal path the partially built message is freed at
+ * "unqueue".
+ */
+static void
+mic_pm_recv_work(struct work_struct *recv_work)
+{
+ int err = 0;
+ int size = 0;
+
+ pm_wq_t *pm_wq = container_of(recv_work, pm_wq_t, work);
+ micpm_ctx_t *pm_ctx = container_of(pm_wq, micpm_ctx_t, recv);
+ mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx);
+ pm_recv_msg_t *recv_msg = NULL;
+
+ if (mic_ctx == NULL || pm_ctx == NULL) {
+ PM_DEBUG("Error retrieving driver context \n");
+ goto unqueue;
+ }
+
+ size = sizeof(pm_msg_header);
+ recv_msg = (void *)kmalloc(sizeof(pm_recv_msg_t), GFP_KERNEL);
+
+ if (recv_msg == NULL) {
+ PM_DEBUG("Error allocating memory to save receive message.\n");
+ goto unqueue;
+ }
+ INIT_LIST_HEAD(&recv_msg->msg);
+ recv_msg->msg_body = NULL;
+
+ /*Get the header */
+ err = mic_pm_recv(mic_ctx, &recv_msg->msg_header, size);
+ if (err < 0) {
+ PM_DEBUG("Error in scif_recv while waiting for PM header message.\n");
+ if (err == -ECONNRESET) {
+ /*Remote node is not in a connected state. */
+ schedule_work(&mic_ctx->micpm_ctx.pm_close);
+ }
+ goto unqueue;
+
+ }
+
+ /* header.len tells us how much payload (if any) follows */
+ if(recv_msg->msg_header.len != 0) {
+ PM_DEBUG("Retrieving %d bytes of message body\n", recv_msg->msg_header.len);
+ recv_msg->msg_body = (void *)kmalloc((sizeof(char) * recv_msg->msg_header.len), GFP_KERNEL);
+ if (recv_msg->msg_body == NULL) {
+ PM_DEBUG("Error allocating memory to receive PM Message\n");
+ goto unqueue;
+ }
+ err = mic_pm_recv(mic_ctx, recv_msg->msg_body, recv_msg->msg_header.len);
+ if (err < 0) {
+ PM_DEBUG("Error in scif_recv while waiting for PM message body\n");
+ if (err == -ECONNRESET) {
+ /*Remote node is not in a connected state. */
+ schedule_work(&mic_ctx->micpm_ctx.pm_close);
+ }
+ goto unqueue;
+ }
+ }
+
+ if(mic_data.dd_pm.enable_pm_logging) {
+ micpm_display_message(mic_ctx,&recv_msg->msg_header,
+ recv_msg->msg_body,"RECV");
+ }
+
+ /* CLOSE/CLOSE_ACK are terminal and handled inline below; everything
+ * else is queued for the message-handler work */
+ if ((recv_msg->msg_header.opcode != PM_MESSAGE_CLOSE) &&
+ ((recv_msg->msg_header.opcode != PM_MESSAGE_CLOSE_ACK))){
+ PM_DEBUG("Adding received message from node %d to list.\n",
+ mic_ctx->bi_id+1);
+ mutex_lock(&mic_ctx->micpm_ctx.msg_mutex);
+ list_add_tail(&recv_msg->msg , &mic_ctx->micpm_ctx.msg_list);
+ mutex_unlock(&mic_ctx->micpm_ctx.msg_mutex);
+
+ if(likely(recv_msg->msg_header.opcode != PM_MESSAGE_TEST)) {
+ PM_DEBUG("Queue message handler work for node: %d\n",mic_ctx->bi_id+1);
+ queue_work(mic_ctx->micpm_ctx.handle_msg.wq,
+ &mic_ctx->micpm_ctx.handle_msg.work);
+ }
+
+ /* re-arm ourselves to receive the next message */
+ queue_work(mic_ctx->micpm_ctx.recv.wq,
+ &mic_ctx->micpm_ctx.recv.work);
+ } else {
+
+ if (recv_msg->msg_header.opcode == PM_MESSAGE_CLOSE) {
+ mic_pm_send_msg(mic_ctx , PM_MESSAGE_CLOSE_ACK, NULL, 0);
+ mic_ctx->micpm_ctx.con_state = PM_DISCONNECTING;
+ schedule_work(&mic_ctx->micpm_ctx.pm_close);
+ } else {
+ /* CLOSE_ACK: wake micpm_stop() waiting on disc_wq */
+ mic_ctx->micpm_ctx.con_state = PM_DISCONNECTING;
+ wake_up(&mic_ctx->micpm_ctx.disc_wq);
+ }
+ goto unqueue;
+ }
+ return;
+unqueue:
+ if (recv_msg) {
+ if (recv_msg->msg_body)
+ kfree(recv_msg->msg_body);
+ kfree(recv_msg);
+ }
+ return;
+}
+
+/*
+ * Work item: tear down the PM endpoint to a device and its receive
+ * machinery (delegates to micpm_stop()).
+ */
+static void
+mic_pm_close_work(struct work_struct *work)
+{
+	micpm_ctx_t *pm_ctx = container_of(work, micpm_ctx_t, pm_close);
+
+	micpm_stop(container_of(pm_ctx, mic_ctx_t, micpm_ctx));
+}
+
+/*
+ * Work item: restart a device after OSPM resume/restore via
+ * pm_start_device(); failures are only logged.
+ */
+static void
+mic_pm_resume_work(struct work_struct *resume_work)
+{
+	pm_wq_t *wq_entry = container_of(resume_work, pm_wq_t, work);
+	micpm_ctx_t *pm_ctx = container_of(wq_entry, micpm_ctx_t, resume);
+	mic_ctx_t *mic_ctx = container_of(pm_ctx, mic_ctx_t, micpm_ctx);
+
+	if (mic_ctx == NULL) {
+		PM_DEBUG("Error retrieving node context.\n");
+		return;
+	}
+	if (pm_start_device(mic_ctx))
+		PM_DEBUG("Failed to start device %d after resume\n",
+			mic_ctx->bi_id);
+}
+
+/* Create PM specific workqueues during driver probe.
+ *
+ * Receive workqueue will store the received message and kick-off
+ * a message handler workqueue which will process them.
+ *
+ * Resume workqueue handles the task of booting uOS during
+ * OSPM resume/restore phase.
+ *
+ * On failure every workqueue created so far is destroyed, so a failed
+ * probe does not leak workqueues (previously they were left
+ * allocated).  Returns 0 on success, -EINVAL or -ENOMEM on failure.
+ */
+int
+setup_pm_workqueues(mic_ctx_t *mic_ctx)
+{
+	int err = 0;
+
+	if (!mic_ctx) {
+		PM_DEBUG("Failed to retrieve device context\n");
+		err = -EINVAL;
+		goto err;
+	}
+
+	/* setup resume wq */
+	snprintf(mic_ctx->micpm_ctx.resume.wq_name,
+		sizeof(mic_ctx->micpm_ctx.resume.wq_name),
+		"PM_RESUME_WQ %d", mic_get_scifnode_id(mic_ctx));
+	if (!(mic_ctx->micpm_ctx.resume.wq
+		= __mic_create_singlethread_workqueue(
+		mic_ctx->micpm_ctx.resume.wq_name))) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	/* Setup Receive wq */
+	snprintf(mic_ctx->micpm_ctx.recv.wq_name,
+		sizeof(mic_ctx->micpm_ctx.recv.wq_name),
+		"RECV_WORK_Q %d", mic_get_scifnode_id(mic_ctx));
+	if (!(mic_ctx->micpm_ctx.recv.wq
+		= __mic_create_singlethread_workqueue(
+		mic_ctx->micpm_ctx.recv.wq_name))) {
+		err = -ENOMEM;
+		goto err_destroy_resume;
+	}
+
+	/* Setup Msg handler wq */
+	snprintf(mic_ctx->micpm_ctx.handle_msg.wq_name,
+		sizeof(mic_ctx->micpm_ctx.handle_msg.wq_name),
+		"MSG_HANDLER_WQ %d", mic_get_scifnode_id(mic_ctx));
+	if (!(mic_ctx->micpm_ctx.handle_msg.wq
+		= __mic_create_singlethread_workqueue(
+		mic_ctx->micpm_ctx.handle_msg.wq_name))) {
+		err = -ENOMEM;
+		goto err_destroy_recv;
+	}
+
+	/* Setup pc6 entry wq */
+	snprintf(mic_ctx->micpm_ctx.pc6_wq_name,
+		sizeof(mic_ctx->micpm_ctx.pc6_wq_name),
+		"PC6_WORK_Q %d", mic_get_scifnode_id(mic_ctx));
+	if (!(mic_ctx->micpm_ctx.pc6_entry_wq
+		= __mic_create_singlethread_workqueue(
+		mic_ctx->micpm_ctx.pc6_wq_name))) {
+		err = -ENOMEM;
+		goto err_destroy_handle_msg;
+	}
+
+	INIT_WORK(&mic_ctx->micpm_ctx.recv.work, mic_pm_recv_work);
+	INIT_WORK(&mic_ctx->micpm_ctx.handle_msg.work, mic_pm_msg_handle_work);
+	INIT_WORK(&mic_ctx->micpm_ctx.pm_close, mic_pm_close_work);
+	INIT_WORK(&mic_ctx->micpm_ctx.resume.work, mic_pm_resume_work);
+	INIT_DELAYED_WORK(&mic_ctx->micpm_ctx.pc6_entry_work, pc6_entry_work);
+	return 0;
+
+	/* unwind in reverse creation order */
+err_destroy_handle_msg:
+	destroy_workqueue(mic_ctx->micpm_ctx.handle_msg.wq);
+	mic_ctx->micpm_ctx.handle_msg.wq = NULL;
+err_destroy_recv:
+	destroy_workqueue(mic_ctx->micpm_ctx.recv.wq);
+	mic_ctx->micpm_ctx.recv.wq = NULL;
+err_destroy_resume:
+	destroy_workqueue(mic_ctx->micpm_ctx.resume.wq);
+	mic_ctx->micpm_ctx.resume.wq = NULL;
+err:
+	return err;
+}
+/*
+ * Power Management driver-level init: open/bind/listen the global
+ * SCIF acceptor endpoint on SCIF_PM_PORT_0, create the accept
+ * workqueue and queue the first accept work item.
+ * Returns 0 on success, -1 if scif_open fails, or a negative errno.
+ */
+int micpm_init()
+{
+	scif_epd_t epd;
+	int con_port;
+	int err = 0;
+
+	epd = scif_open();
+	if (epd == NULL || epd == SCIF_OPEN_FAILED) {
+		PM_DEBUG("scif_open failed\n");
+		return -1;
+	}
+
+	con_port = scif_bind(epd, SCIF_PM_PORT_0);
+	if (con_port < 0) {
+		PM_DEBUG("scif_bind to port failed with error %d\n", con_port);
+		err = con_port;
+		goto exit_close;
+	}
+
+	/* No real upper limit on the number of connections; once
+	 * scif_listen accepts 0 to mean "unlimited", change this. */
+	err = scif_listen(epd, 100);
+	if (err < 0) {
+		PM_DEBUG("Listen ioctl failed with error %d\n", err);
+		goto exit_close;
+	}
+	mic_data.dd_pm.epd = epd;
+
+	snprintf(mic_data.dd_pm.accept.wq_name,
+		sizeof(mic_data.dd_pm.accept.wq_name), "PM ACCEPT");
+	mic_data.dd_pm.accept.wq =
+		__mic_create_singlethread_workqueue(mic_data.dd_pm.accept.wq_name);
+	if (mic_data.dd_pm.accept.wq == NULL) {
+		err = -ENOMEM;
+		PM_DEBUG("create workqueue returned null\n");
+		goto exit_close;
+	}
+	INIT_WORK(&mic_data.dd_pm.accept.work, mic_pm_accept_work);
+	mutex_init(&mic_data.dd_pm.pm_accept_mutex);
+	mutex_init(&mic_data.dd_pm.pm_idle_mutex);
+	atomic_set(&mic_data.dd_pm.connected_clients, 0);
+
+	/* kick off the first accept; the work requeues itself */
+	queue_work(mic_data.dd_pm.accept.wq,
+		&mic_data.dd_pm.accept.work);
+	mic_data.dd_pm.enable_pm_logging = 0;
+	atomic_set(&mic_data.dd_pm.wakeup_in_progress, 0);
+
+	micpm_dbg_parent_init();
+	return err;
+
+exit_close:
+	scif_close(epd);
+	return err;
+}
+
+/*
+ * Driver-level PM teardown: close the SCIF acceptor endpoint, drain
+ * and destroy the accept workqueue, destroy the global mutexes and
+ * remove the debugfs tree.  A non-zero connected_clients count at this
+ * point indicates an accounting problem and is only logged.
+ */
+void
+micpm_uninit(void)
+{
+	int err;
+	scif_epd_t epd = mic_data.dd_pm.epd;
+
+	if (atomic_read(&mic_data.dd_pm.connected_clients) > 0)
+		PM_DEBUG("connected_clients is nonzero (%d)\n",
+			atomic_read(&mic_data.dd_pm.connected_clients));
+
+	err = scif_close(epd);
+	if (err)
+		PM_DEBUG("Scif_close failed with error %d\n", err);
+
+	if (mic_data.dd_pm.accept.wq) {
+		PM_DEBUG("Flushing accept workqueue\n");
+		flush_workqueue(mic_data.dd_pm.accept.wq);
+		destroy_workqueue(mic_data.dd_pm.accept.wq);
+		mic_data.dd_pm.accept.wq = NULL;
+	}
+
+	mutex_destroy(&mic_data.dd_pm.pm_accept_mutex);
+	mutex_destroy(&mic_data.dd_pm.pm_idle_mutex);
+	debugfs_remove_recursive(mic_data.dd_pm.pmdbgparent_dir);
+}
+
+/*
+ * Open the per-device Power Management context: seed defaults, create
+ * the PM workqueues, and initialize the message list, mutex, wait
+ * queue and debugfs entries.  Returns 0 or a negative errno from
+ * setup_pm_workqueues().
+ */
+int
+micpm_probe(mic_ctx_t * mic_ctx) {
+
+	micpm_ctx_t *pm_ctx = &mic_ctx->micpm_ctx;
+	int err;
+
+	pm_ctx->pm_epd = NULL;
+	pm_ctx->idle_state = PM_IDLE_STATE_PC0;
+	pm_ctx->recv.wq = NULL;
+	pm_ctx->handle_msg.wq = NULL;
+	pm_ctx->mic_suspend_state = MIC_RESET;
+	/* local defaults on; card-negotiated options off until OPEN_ACK */
+	pm_ctx->pc3_enabled = true;
+	pm_ctx->pc6_enabled = true;
+	pm_ctx->pm_options.pc3_enabled = 0;
+	pm_ctx->pm_options.pc6_enabled = 0;
+
+	err = setup_pm_workqueues(mic_ctx);
+	if (err)
+		return err;
+
+	mutex_init(&pm_ctx->msg_mutex);
+	INIT_LIST_HEAD(&pm_ctx->msg_list);
+	init_waitqueue_head(&pm_ctx->disc_wq);
+	atomic_set(&pm_ctx->pm_ref_cnt, 0);
+	pm_ctx->pc6_timeout = PC6_TIMER;
+
+	/* create debugfs entries*/
+	micpm_dbg_init(mic_ctx);
+	return 0;
+}
+
+/*
+ * Tear down the per-device PM context: debugfs tree, all four PM
+ * workqueues, the node mask and the message mutex.  Always returns 0.
+ */
+int
+micpm_remove(mic_ctx_t * mic_ctx) {
+
+	micpm_ctx_t *pm_ctx = &mic_ctx->micpm_ctx;
+
+	debugfs_remove_recursive(pm_ctx->pmdbg_dir);
+
+	if (pm_ctx->resume.wq) {
+		destroy_workqueue(pm_ctx->resume.wq);
+		pm_ctx->resume.wq = NULL;
+	}
+	if (pm_ctx->pc6_entry_wq) {
+		destroy_workqueue(pm_ctx->pc6_entry_wq);
+		pm_ctx->pc6_entry_wq = NULL;
+	}
+	if (pm_ctx->recv.wq) {
+		destroy_workqueue(pm_ctx->recv.wq);
+		pm_ctx->recv.wq = NULL;
+	}
+	if (pm_ctx->handle_msg.wq) {
+		destroy_workqueue(pm_ctx->handle_msg.wq);
+		pm_ctx->handle_msg.wq = NULL;
+	}
+
+	micpm_nodemask_uninit(mic_ctx);
+	mutex_destroy(&pm_ctx->msg_mutex);
+	return 0;
+}
+
+/*
+ * Start PM on a freshly accepted connection: mark it CONNECTING, kick
+ * the receive work, bump the client count, reset the idle state and
+ * send PM_MESSAGE_OPEN to the card.  Returns mic_pm_send_msg()'s
+ * result.
+ */
+int
+micpm_start(mic_ctx_t *mic_ctx) {
+
+	int ref_cnt;
+
+	mic_ctx->micpm_ctx.con_state = PM_CONNECTING;
+
+	/* start listening for messages from the card */
+	queue_work(mic_ctx->micpm_ctx.recv.wq,
+		&mic_ctx->micpm_ctx.recv.work);
+
+	atomic_inc(&mic_data.dd_pm.connected_clients);
+	ref_cnt = atomic_read(&mic_ctx->micpm_ctx.pm_ref_cnt);
+	if (ref_cnt)
+		printk("Warning: PM ref_cnt is non-zero during start. "
+			"ref_cnt = %d PM features may not work as expected\n",
+			ref_cnt);
+	mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0;
+	set_host_state(mic_ctx, PM_IDLE_STATE_PC0);
+	return mic_pm_send_msg(mic_ctx , PM_MESSAGE_OPEN, NULL, 0);
+}
+
+/*
+ * Close the per-device Power Management context: send CLOSE and wait
+ * (bounded) for the card's CLOSE_ACK, close the scif endpoint, flush
+ * pending work items, cancel the PC6 timer and drain any queued
+ * messages.  The function can block.
+ * Returns 0 or the last error encountered.
+ */
+int
+micpm_stop(mic_ctx_t *mic_ctx) {
+
+ int err = 0;
+ int node_lost = 0;
+ if(mic_ctx == NULL) {
+ PM_DEBUG("Mic context not Initialized\n");
+ return -EINVAL;
+ }
+
+ /* Hold a PM reference so the card cannot idle while we tear down;
+ * a failure here means the node may already be lost. */
+ if ((micpm_get_reference(mic_ctx, true))) {
+ PM_DEBUG("get_reference failed. Node may be lost\n");
+ node_lost = 1;
+ }
+
+ mutex_lock(&mic_data.dd_pm.pm_accept_mutex);
+ if ((mic_ctx->micpm_ctx.con_state == PM_CONNECTED) &&
+ (mic_ctx->state != MIC_LOST)) {
+ if (!mic_pm_send_msg(mic_ctx, PM_MESSAGE_CLOSE, NULL, 0)) {
+ /* mic_pm_recv_work sets PM_DISCONNECTING and wakes disc_wq
+ * on CLOSE_ACK; wait_event_timeout returns 0 on timeout */
+ err = wait_event_timeout(
+ mic_ctx->micpm_ctx.disc_wq,
+ mic_ctx->micpm_ctx.con_state == PM_DISCONNECTING,
+ NODE_ALIVE_TIMEOUT);
+ if (!err) {
+ PM_DEBUG("Timed out waiting CLOSE ACK"
+ " from node.\n");
+ }
+ }
+ }
+
+ if(mic_ctx->micpm_ctx.pm_epd != NULL) {
+ PM_DEBUG("Power Management: Closing connection to"
+ " node: %d port:%d\n", mic_ctx->micpm_ctx.pm_epd->peer.node,
+ mic_ctx->micpm_ctx.pm_epd->peer.port);
+ /* err is reused here; scif_close result becomes the return value */
+ err = scif_close(mic_ctx->micpm_ctx.pm_epd);
+ if(err!= 0)
+ PM_DEBUG("Scif_close failed with error %d\n",err);
+ mic_ctx->micpm_ctx.pm_epd = NULL;
+ micpm_decrement_clients();
+ }
+ mic_ctx->micpm_ctx.con_state = PM_DISCONNECTED;
+ mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0;
+ /* wait out any in-flight work before draining the message list */
+ flush_workqueue(mic_ctx->micpm_ctx.resume.wq);
+ flush_workqueue(mic_ctx->micpm_ctx.recv.wq);
+ flush_workqueue(mic_ctx->micpm_ctx.handle_msg.wq);
+ cancel_delayed_work_sync(&mic_ctx->micpm_ctx.pc6_entry_work);
+
+ /* Process messages in message queue */
+ pm_process_msg_list(mic_ctx);
+
+ if (!node_lost)
+ micpm_put_reference(mic_ctx);
+ mutex_unlock(&mic_data.dd_pm.pm_accept_mutex);
+ return err;
+}
+
+/*
+ * Load the uOS and restart driver components after an OSPM
+ * resume/restore: wait for the power reset, perform a software reset,
+ * then boot the uOS only if the card was online before suspend.
+ * Always returns 0 (boot failures are only logged).
+ */
+int
+pm_start_device(mic_ctx_t *mic_ctx)
+{
+	if (mic_ctx == NULL) {
+		PM_DEBUG("Error retreving driver context\n");
+		return 0;
+	}
+
+	PM_DEBUG("Resume MIC device:%d\n", mic_ctx->bi_id);
+	/* Make sure the Power reset during Resume/Restore is complete*/
+	adapter_wait_reset(mic_ctx);
+	wait_for_reset(mic_ctx);
+
+	/*Perform software reset */
+	adapter_reset(mic_ctx, RESET_WAIT, !RESET_REATTEMPT);
+	wait_for_reset(mic_ctx);
+
+	/* Boot uOS only if it was online before suspend */
+	if (mic_ctx->micpm_ctx.mic_suspend_state == MIC_ONLINE &&
+	    adapter_start_device(mic_ctx))
+		PM_DEBUG("booting uos... failed\n");
+
+	return 0;
+}
+
+/*
+ * Function to stop all the driver components and unload the uOS
+ * during a suspend/hibernate operation
+ */
+int
+pm_stop_device(mic_ctx_t *mic_ctx)
+{
+ if (!mic_ctx) {
+ PM_DEBUG("Error retreving driver context\n");
+ return 0;
+ }
+
+ mic_ctx->micpm_ctx.mic_suspend_state = mic_ctx->state;
+
+ PM_DEBUG("Suspend MIC device:#%d\n", mic_ctx->bi_id);
+ if (MIC_ONLINE == mic_ctx->micpm_ctx.mic_suspend_state) {
+ adapter_shutdown_device(mic_ctx);
+ if (!wait_for_shutdown_and_reset(mic_ctx)) {
+ /* Shutdown failed. Fall back on forced reset */
+ adapter_stop_device(mic_ctx, RESET_WAIT, !RESET_REATTEMPT);
+ wait_for_reset(mic_ctx);
+ }
+ }
+ else {
+ /* If card is in any state but ONLINE, make sure card stops */
+ adapter_stop_device(mic_ctx, RESET_WAIT, !RESET_REATTEMPT);
+ wait_for_reset(mic_ctx);
+ }
+
+ mutex_lock(&mic_ctx->state_lock);
+ mic_ctx->state = MIC_RESET;
+ mutex_unlock(&mic_ctx->state_lock);
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* contains code to handle MIC IO control codes */
+
+
+#include "mic_common.h"
+#include <linux/module.h>
+
/* Helper methods for debugging / unit testing. */
+static int check_test_msg(mic_ctx_t *mic_ctx, void *buf, uint32_t len);
+
+
+
/*
 * PM_MMIO_REGVALUE_GET - generate a debugfs read accessor for the PM
 * MMIO register at @_offset, plus the matching simple-attribute fops
 * (fops_<_name>). The debugfs private data carries the board id.
 *
 * Fix: replaced the leftover debug printk("DD") with a meaningful
 * KERN_ERR message.
 */
#define PM_MMIO_REGVALUE_GET(_name, _offset) \
int get_##_name(void *data, uint64_t *value) \
{ \
	uint64_t bid = (uint64_t)data; \
	mic_ctx_t *mic_ctx; \
 \
	if (bid >= mic_data.dd_numdevs) { \
		return -EINVAL; \
	} \
	mic_ctx = get_per_dev_ctx(bid); \
	if (!mic_ctx) { \
		printk(KERN_ERR "%s: no driver context for board %llu\n", \
			__func__, (unsigned long long)bid); \
		return -EINVAL; \
	} \
 \
	*value = pm_reg_read(mic_ctx, _offset); \
	return 0; \
} \
DEFINE_SIMPLE_ATTRIBUTE(fops_##_name, get_##_name, NULL, "%llu");
+
/* Instantiate debugfs read accessors (get_<name> / fops_<name>) for the
 * PM status/control MMIO registers listed below. */
static PM_MMIO_REGVALUE_GET(svidctrl, SBOX_SVID_CONTROL);
static PM_MMIO_REGVALUE_GET(pcuctrl, SBOX_PCU_CONTROL);
static PM_MMIO_REGVALUE_GET(hoststate,SBOX_HOST_PMSTATE);
static PM_MMIO_REGVALUE_GET(cardstate, SBOX_UOS_PMSTATE);
static PM_MMIO_REGVALUE_GET(wtimer, SBOX_C3WAKEUP_TIMER);
static PM_MMIO_REGVALUE_GET(gpmctrl, GBOX_PM_CTRL);
static PM_MMIO_REGVALUE_GET(core_volt, SBOX_COREVOLT);
static PM_MMIO_REGVALUE_GET(uos_pcuctrl, SBOX_UOS_PCUCONTROL);
+
+static int depgraph_j2i_show(struct seq_file *s, void *pos)
+{
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ int i, j;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ seq_printf(s,"=================================================================\n");
+ seq_printf(s,"%-10s |%-25s\n", "Scif Node" , "dependent nodes");
+ seq_printf(s,"=================================================================\n");
+
+ for ( i = 0; i <= ms_info.mi_maxid; i++) {
+ seq_printf(s, "%-10d |", i);
+ for (j = 0; j <= ms_info.mi_maxid; j++) {
+ switch(ms_info.mi_depmtrx[j][i]) {
+ case DEP_STATE_DEPENDENT:
+ {
+ /* (A) - active dependency on node i */
+ seq_printf(s, "%d(A),", j);
+ break;
+ }
+ case DEP_STATE_DISCONNECT_READY:
+ {
+ /* (R) - node j has sent PC6 ready message to the host
+ * dependency is not active so node i can go idle
+ */
+ seq_printf(s, "%d(R),", j);
+ break;
+ }
+ case DEP_STATE_DISCONNECTED:
+ {
+ /* (D) - node j is in idle state.
+ * dependency is not active so node i can go idle
+ */
+ seq_printf(s, "%d(D),", j);
+ break;
+ }
+ }
+ }
+ seq_printf(s,"\n=================================================================\n");
+ }
+
+ return 0;
+}
+
/* debugfs open: bind depgraph_j2i_show to this node's entry
 * (i_private carries the board id). */
static int depgraph_j2i_open(struct inode *inode, struct file *file)
{
	return single_open(file, depgraph_j2i_show, inode->i_private);
}
+
+static struct file_operations depgraph_j2i_file_ops = {
+ .owner = THIS_MODULE,
+ .open = depgraph_j2i_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release
+};
+
+static int depgraph_i2j_show(struct seq_file *s, void *pos)
+{
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ int i, j;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ seq_printf(s,"=================================================================\n");
+ seq_printf(s,"%-10s |%-25s\n", "Scif Node" , "is dependent on Nodes");
+ seq_printf(s,"=================================================================\n");
+
+ for ( i = 0; i <= ms_info.mi_maxid; i++) {
+ seq_printf(s, "%-10d |", i);
+ for (j = 0; j <= ms_info.mi_maxid; j++) {
+ switch(ms_info.mi_depmtrx[i][j]) {
+ case DEP_STATE_DEPENDENT:
+ {
+ /* (A) - active dependency on node j */
+ seq_printf(s, "%d(A),", j);
+ break;
+ }
+ case DEP_STATE_DISCONNECT_READY:
+ {
+ /* (R) - node j has sent PC6 ready message to the host */
+ seq_printf(s, "%d(R),", j);
+ break;
+ }
+ case DEP_STATE_DISCONNECTED:
+ {
+ /* (D) - node j is in idle state.
+ * This should not happen unless node i itself is in idle state
+ */
+ seq_printf(s, "%d(D),", j);
+ break;
+ }
+ }
+ }
+ seq_printf(s,"\n=================================================================\n");
+ }
+
+ return 0;
+}
+
/* debugfs open: bind depgraph_i2j_show to this node's entry
 * (i_private carries the board id). */
static int depgraph_i2j_open(struct inode *inode, struct file *file)
{
	return single_open(file, depgraph_i2j_show, inode->i_private);
}
+
+static struct file_operations depgraph_i2j_file_ops = {
+ .owner = THIS_MODULE,
+ .open = depgraph_i2j_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release
+};
+
+static int connection_info_show(struct seq_file *s, void *pos) {
+
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ int count = 0;
+ struct list_head *position, *tmpq;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ seq_printf(s,"=========================================================================\n");
+ if(mic_ctx->micpm_ctx.pm_epd != NULL) {
+ seq_printf(s, "%-35s | %35d\n", "Local Node", mic_ctx->micpm_ctx.pm_epd->port.node);
+ seq_printf(s, "%-35s | %35d\n", "Local Port", mic_ctx->micpm_ctx.pm_epd->port.port);
+ seq_printf(s, "%-35s | %35d\n", "Remote Node", mic_ctx->micpm_ctx.pm_epd->peer.node);
+ seq_printf(s, "%-35s | %35d\n", "Remote Port", mic_ctx->micpm_ctx.pm_epd->peer.port);
+ seq_printf(s, "%-35s | %35d\n", "Connection state", mic_ctx->micpm_ctx.pm_epd->state);
+ if(!list_empty(&mic_ctx->micpm_ctx.msg_list)) {
+ list_for_each_safe(position, tmpq, &mic_ctx->micpm_ctx.msg_list) {
+ count++;
+ }
+ } else {
+ count = 0;
+ }
+ seq_printf(s, "%-35s | %35d\n", "Messages in queue", count);
+ } else {
+ seq_printf(s, "%s\n", "No PM connection found");
+ }
+ seq_printf(s,"=========================================================================\n");
+
+ return 0;
+}
+
/* debugfs open: bind connection_info_show to this node's entry
 * (i_private carries the board id). */
static int connection_info_open(struct inode *inode, struct file *file)
{
	return single_open(file, connection_info_show, inode->i_private);
}
+
+static struct file_operations connection_info_file_ops = {
+ .owner = THIS_MODULE,
+ .open = connection_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release
+};
+
+static int active_set_show(struct seq_file *s, void *pos) {
+
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ int i, j = 0;
+ uint8_t *nodemask;
+ uint8_t *temp_buf_ptr;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ nodemask = (uint8_t*) kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL);
+ if (!nodemask) {
+ seq_printf(s, "%s\n", "Cannot allocate buffer");
+ return 0;
+ }
+
+ if ((micscif_get_activeset(mic_ctx->bi_id + 1, nodemask))) {
+ seq_printf(s, "%s\n", "Cannot calculate activation set");
+ kfree(nodemask);
+ return 0;
+ }
+
+ seq_printf(s, "%s\n", "Nodes in activation set:");
+ temp_buf_ptr = nodemask;
+ for ( i = 0; i < mic_ctx->micpm_ctx.nodemask.len; i++) {
+ temp_buf_ptr = nodemask + i;
+ for (j = 0; j < 8; j++) {
+ if (*temp_buf_ptr & (1ULL << j))
+ seq_printf(s, "%d ", j + (i * 8));
+ }
+ }
+ seq_printf(s, "\n");
+ kfree(nodemask);
+ return 0;
+}
+
/* debugfs open: bind active_set_show to this node's entry
 * (i_private carries the board id). */
static int active_set_open(struct inode *inode, struct file *file)
{
	return single_open(file, active_set_show, inode->i_private);
}
+
+static struct file_operations activation_set_file_ops = {
+ .owner = THIS_MODULE,
+ .open = active_set_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release
+};
+
+static int deactive_set_show(struct seq_file *s, void *pos) {
+
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ int i, j;
+ uint8_t *nodemask;
+ uint8_t *temp_buf_ptr;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ nodemask = (uint8_t*) kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL);
+ if (!nodemask) {
+ seq_printf(s, "%s\n", "Cannot allocate buffer");
+ return 0;
+ }
+
+ if ((micscif_get_deactiveset(mic_ctx->bi_id +1, nodemask, 1))) {
+ seq_printf(s, "%s\n", "Cannot calculate activation set");
+ kfree(nodemask);
+ return 0;
+ }
+
+ seq_printf(s, "%s\n", "Nodes in deactivation set:");
+ temp_buf_ptr = nodemask;
+ for ( i = 0; i < mic_ctx->micpm_ctx.nodemask.len; i++) {
+ temp_buf_ptr = nodemask + i;
+ for (j = 0; j < 8; j++) {
+ if (*temp_buf_ptr & (1ULL << j))
+ seq_printf(s, "%d ", j + (i * 8));
+ }
+ }
+ seq_printf(s, "\n");
+ kfree(nodemask);
+ return 0;
+}
+
/* debugfs open: bind deactive_set_show to this node's entry
 * (i_private carries the board id). */
static int deactive_set_open(struct inode *inode, struct file *file)
{
	return single_open(file, deactive_set_show, inode->i_private);
}
+
+static struct file_operations deactivation_set_file_ops = {
+ .owner = THIS_MODULE,
+ .open = deactive_set_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release
+};
+
+static int ospm_restart_show(struct seq_file *s, void *pos) {
+
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ int err;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ err = pm_stop_device(mic_ctx);
+ if(err) {
+ seq_printf(s, "%s:%d\n", "Error calling pm_stop_device.", err);
+ return err;
+ }
+
+ err = pm_start_device(mic_ctx);
+ if(err) {
+ seq_printf(s, "%s:%d\n", "Error calling pm_start_device.", err);
+ return err;
+ }
+
+ return 0;
+}
+
/* debugfs open: bind ospm_restart_show to this node's entry
 * (i_private carries the board id). */
static int ospm_restart_open(struct inode *inode, struct file *file)
{
	return single_open(file, ospm_restart_show, inode->i_private);
}
+
+static struct file_operations ospm_restart_file_ops = {
+ .owner = THIS_MODULE,
+ .open = ospm_restart_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release
+};
+
+static int testmsg_set(void *data, uint64_t value)
+{
+ uint64_t bid;
+ mic_ctx_t *mic_ctx;
+ int err;
+
+ bid = (uint64_t)data;
+ if (bid >= mic_data.dd_numdevs) {
+ return -EINVAL;
+ }
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ if (value == 0) {
+ return -EINVAL;
+ }
+
+ err = mic_pm_send_msg(mic_ctx ,PM_MESSAGE_TEST, PM_TEST_MSG_BODY, sizeof(PM_TEST_MSG_BODY));
+ return err;
+}
+
+static int testmsg_get(void *data, uint64_t *value)
+{
+ uint64_t bid;
+ mic_ctx_t *mic_ctx;
+ int err;
+
+ bid = (uint64_t)data;
+ if (bid >= mic_data.dd_numdevs) {
+ return -EINVAL;
+ }
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx) {
+ return -EINVAL;
+ }
+
+ err = check_test_msg(mic_ctx,PM_TEST_MSG_BODY, sizeof(PM_TEST_MSG_BODY));
+ *value = err;
+
+ return err;
+}
+DEFINE_SIMPLE_ATTRIBUTE(testmsg_fops, testmsg_get, testmsg_set, "%llu");
+
+int
+micpm_dbg_init(mic_ctx_t *mic_ctx)
+{
+ /* directory name will be in format micpmXXXXX
+ * so assuming the name string wont excceed 12 characters */
+ const uint32_t DBG_DIRNAME_LENGTH = 12;
+ char pmdbg_dir_name[DBG_DIRNAME_LENGTH];
+ micpm_ctx_t *micpm_ctx = &mic_ctx->micpm_ctx;
+ struct dentry *mmiodir;
+ uint64_t id = mic_ctx->bi_id;
+
+
+ if(!mic_data.dd_pm.pmdbgparent_dir) {
+ printk(KERN_ERR "%s: %d Parent debugfs directory does not exist.\n"
+ "debugfs may not be supported in kernel", __func__, __LINE__);
+ return -EOPNOTSUPP;
+ }
+
+ snprintf(pmdbg_dir_name, sizeof(pmdbg_dir_name), "micpm%d", mic_ctx->bi_id);
+ micpm_ctx->pmdbg_dir = debugfs_create_dir
+ (pmdbg_dir_name, mic_data.dd_pm.pmdbgparent_dir);
+ if (!micpm_ctx->pmdbg_dir) {
+ printk(KERN_ERR "%s: %d Failed in creating debugfs directory\n"
+ "debugfs may noe be supported in kernel", __func__, __LINE__);
+ return -EOPNOTSUPP;
+ }
+
+ /* Create debugfs entry to get/set idle state of the card known by host*/
+ debugfs_create_u32("idle_state", S_IRUGO | S_IWUSR, micpm_ctx->pmdbg_dir, &micpm_ctx->idle_state);
+
+ /*
+ * Create debugfs entry for sending PM_TEST_MESSAGE for testing communication to card
+ * set value = PM_MESSAGE_TEST to send the message to card
+ * get value to verfy that message was successfully sent, looped back by card and received.(0 = success)
+ */
+ debugfs_create_file("testmsg", S_IRUGO | S_IWUSR, micpm_ctx->pmdbg_dir, (void*)id, &testmsg_fops);
+
+ /* Create debugfs entry for showing for each node 'i' , all nodes 'j' i is dependent on */
+ debugfs_create_file("depgraph_i2j",
+ S_IRUGO,
+ micpm_ctx->pmdbg_dir,
+ (void*)id,
+ &depgraph_i2j_file_ops);
+
+ /* Create debugfs entry for showing for each node 'i', all nodes 'j' which are dependent on 'i' */
+ debugfs_create_file("depgraph_j2i",
+ S_IRUGO,
+ micpm_ctx->pmdbg_dir,
+ (void*)id,
+ &depgraph_j2i_file_ops);
+
+ /* Create debugfs entry for showing connection info for a node */
+ debugfs_create_file("connection_info",
+ S_IRUGO,
+ micpm_ctx->pmdbg_dir,
+ (void*)id,
+ &connection_info_file_ops);
+
+ /* Create debugfs entry to initiate OSPM restart for a node */
+ debugfs_create_file("ospm_restart",
+ S_IRUGO,
+ micpm_ctx->pmdbg_dir,
+ (void*)id,
+ &ospm_restart_file_ops);
+
+ /* Create debugfs entry to display activation set for a node */
+ debugfs_create_file("activation_set",
+ S_IRUGO,
+ micpm_ctx->pmdbg_dir,
+ (void*)id,
+ &activation_set_file_ops);
+
+ /* Create debugfs entry to display de-activation set for a node */
+ debugfs_create_file("deactivation_set",
+ S_IRUGO,
+ micpm_ctx->pmdbg_dir,
+ (void*)id,
+ &deactivation_set_file_ops);
+
+ /* Create debugfs entries for reading power management status/control register value in MMIO region */
+ mmiodir = debugfs_create_dir("mmio", micpm_ctx->pmdbg_dir);
+ if (!mmiodir) {
+ printk(KERN_ERR "%s: %d Failed in creating debugfs directory\n"
+ "debugfs may noe be supported in kernel", __func__, __LINE__);
+ return -EOPNOTSUPP;
+ }
+ debugfs_create_file("svidctrl", S_IRUGO, mmiodir,(void*)id, &fops_svidctrl);
+ debugfs_create_file("pcuctrl", S_IRUGO, mmiodir,(void*)id, &fops_pcuctrl);
+ debugfs_create_file("hoststate", S_IRUGO, mmiodir,(void*)id, &fops_hoststate);
+ debugfs_create_file("cardstate", S_IRUGO, mmiodir,(void*)id, &fops_cardstate);
+ debugfs_create_file("wtimer", S_IRUGO, mmiodir,(void*)id, &fops_wtimer);
+ debugfs_create_file("gpmctrl", S_IRUGO, mmiodir,(void*)id, &fops_gpmctrl);
+ debugfs_create_file("core_volt", S_IRUGO, mmiodir,(void*)id, &fops_core_volt);
+ debugfs_create_file("uos_pcuctrl", S_IRUGO, mmiodir,(void*)id, &fops_uos_pcuctrl);
+
+ return 0;
+}
+
+void micpm_dbg_parent_init(void) {
+ mic_data.dd_pm.pmdbgparent_dir = debugfs_create_dir("micpm", NULL);
+ if (!mic_data.dd_pm.pmdbgparent_dir) {
+ PM_DEBUG("%s: %d Failed in creating debugfs directory\n"
+ "debugfs may not be supported in kernel", __func__, __LINE__);
+ }
+
+ debugfs_create_u32("enable_pm_logging", S_IRUGO | S_IWUSR,
+ mic_data.dd_pm.pmdbgparent_dir, &mic_data.dd_pm.enable_pm_logging);
+
+ return;
+}
+
+
/*
 * check_test_msg - verify a looped-back PM test message.
 *
 * The PM_MESSAGE_TEST message sent to the card is echoed back by the
 * card and queued on micpm_ctx.msg_list. This function dequeues the
 * first queued test message and compares its body against @buf.
 *
 * @mic_ctx: per-device driver context.
 * @buf:     expected message body.
 * @len:     length of @buf; must equal sizeof(pm_msg_unit_test).
 *
 * Returns 0 when a test message was found and its body matches @buf;
 * -EINVAL otherwise (bad length, empty list, no test message queued,
 * or missing body). A matched/dequeued message is freed here.
 */
static int
check_test_msg(mic_ctx_t *mic_ctx, void *buf, uint32_t len)
{
	int err = -EINVAL;
	pm_recv_msg_t *recv_msg = NULL;
	struct list_head *pos = NULL, *tmpq = NULL;
	bool msg_found = false;

	if(len != sizeof(pm_msg_unit_test)) {
		pr_debug("Invalid Args: Size of buffer\n");
		return -EINVAL;
	}

	/* msg_mutex serializes access to the receive message list. */
	mutex_lock(&mic_ctx->micpm_ctx.msg_mutex);
	if(!list_empty_careful(&mic_ctx->micpm_ctx.msg_list)) {
		list_for_each_safe(pos, tmpq, &mic_ctx->micpm_ctx.msg_list) {
			recv_msg = list_entry(pos, pm_recv_msg_t, msg);
			/* Do not touch the message if its not a test message */
			if (recv_msg->msg_header.opcode == PM_MESSAGE_TEST) {
				list_del(&recv_msg->msg);
				msg_found = true;
				break;
			}
		}
	} else {
		pr_debug("empty message list \n");
		goto no_msg;
	}

	if (msg_found == false) {
		pr_debug("Test msg not found \n");
		goto no_msg;
	}

	if(recv_msg->msg_body == NULL) {
		pr_debug("Invalid source buffer\n");
		goto list_free;
	}

	/* Compare the echoed body with the expected one (0 on match). */
	err = strncmp((char*)recv_msg->msg_body, (char*)buf, len);
	kfree(recv_msg->msg_body);

list_free:
	kfree(recv_msg);

no_msg:
	mutex_unlock(&mic_ctx->micpm_ctx.msg_mutex);
	return err;

}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic_common.h"
+#include "scif.h"
+#include "mic/micscif.h"
+#include "mic/mic_pm.h"
+#include "mic/micveth_dma.h"
+#include <linux/virtio_ring.h>
+#include "linux/virtio_blk.h"
+#include "mic/mic_virtio.h"
+
// Helper functions for reading and writing PM MMIO registers.
+int pm_reg_read(mic_ctx_t *mic_ctx, uint32_t regoffset) {
+ uint32_t regval = 0;
+if (mic_ctx->bi_family == FAMILY_ABR)
+ regval = DBOX_READ(mic_ctx->mmio.va, regoffset);
+else if (mic_ctx->bi_family == FAMILY_KNC)
+ regval = SBOX_READ(mic_ctx->mmio.va, regoffset);
+
+ return regval;
+}
+
+int pm_reg_write(uint32_t value, mic_ctx_t *mic_ctx, uint32_t regoffset) {
+ int err = 0;
+if (mic_ctx->bi_family == FAMILY_ABR)
+ DBOX_WRITE(value, mic_ctx->mmio.va, regoffset);
+else if (mic_ctx->bi_family == FAMILY_KNC)
+ SBOX_WRITE(value, mic_ctx->mmio.va, regoffset);
+
+ return err;
+}
+
+int hw_idle(mic_ctx_t *mic_ctx) {
+
+ uint8_t is_ring_active;
+ sbox_pcu_ctrl_t ctrl_regval = {0};
+ uint32_t idle_wait_cnt;
+
+ for(idle_wait_cnt = 0; idle_wait_cnt <= MAX_HW_IDLE_WAIT_COUNT;
+ idle_wait_cnt++) {
+ ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL);
+ is_ring_active = ctrl_regval.bits.mclk_enabled;
+ if(likely(!is_ring_active))
+ return !is_ring_active;
+ msleep(1);
+ }
+
+ PM_DEBUG("Timing out waiting for HW to become idle\n");
+ return !is_ring_active;
+}
+
+int hw_active(mic_ctx_t *mic_ctx) {
+ uint8_t is_ring_active;
+ sbox_pcu_ctrl_t ctrl_regval;
+ uint32_t idle_wait_cnt;
+
+ for(idle_wait_cnt = 0; idle_wait_cnt <= MAX_HW_IDLE_WAIT_COUNT;
+ idle_wait_cnt++) {
+ ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL);
+ is_ring_active = ctrl_regval.bits.mclk_enabled;
+ if (likely(is_ring_active))
+ return is_ring_active;
+ msleep(10);
+ }
+
+ PM_DEBUG("Timing out waiting for HW to become active\n");
+ return is_ring_active;
+
+}
+
+PM_IDLE_STATE get_card_state(mic_ctx_t *mic_ctx) {
+
+ PM_IDLE_STATE state;
+ sbox_uos_pm_state_t upmstate_regval = {0};
+ upmstate_regval.value = pm_reg_read(mic_ctx, SBOX_UOS_PMSTATE);
+ state = (PM_IDLE_STATE)(upmstate_regval.bits.uos_pm_state);
+ return state;
+
+}
+
+PM_IDLE_STATE get_host_state(mic_ctx_t *mic_ctx) {
+
+ PM_IDLE_STATE state;
+ sbox_host_pm_state_t hpmstate_regval = {0};
+ hpmstate_regval.value = pm_reg_read(mic_ctx, SBOX_HOST_PMSTATE);
+ state = (PM_IDLE_STATE)(hpmstate_regval.bits.host_pm_state);
+ return state;
+
+}
+
+int set_host_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state) {
+
+ int err = 0;
+ sbox_host_pm_state_t hpmstate_regval = {0};
+ hpmstate_regval.value = pm_reg_read(mic_ctx, SBOX_HOST_PMSTATE);
+ hpmstate_regval.bits.host_pm_state = 0;
+ hpmstate_regval.bits.host_pm_state = state;
+ pm_reg_write(hpmstate_regval.value, mic_ctx, SBOX_HOST_PMSTATE);
+ return err;
+}
+
+int check_card_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state) {
+ PM_IDLE_STATE card_state = get_card_state(mic_ctx);
+ return (state == card_state) ? 1 : 0;
+}
+
+int check_host_state(mic_ctx_t *mic_ctx, PM_IDLE_STATE state) {
+ PM_IDLE_STATE host_state = get_host_state(mic_ctx);
+ return (state == host_state) ? 1 : 0;
+}
+
+uint32_t svid_cmd_fmt(unsigned int bits)
+{
+ unsigned int bits_set,bmask;
+
+ bmask = bits;
+
+ for (bits_set = 0; bmask; bits_set++) {
+ /* Zero the least significant bit that is set */
+ bmask &= (bmask - 1);
+ }
+ bits <<= 1; /* Make way for the parity bit */
+ if (bits_set & 1) { /* odd number of 1s */
+ bits |= 1;
+ }
+
+ return bits;
+}
+
+void set_vid(mic_ctx_t *mic_ctx, sbox_svid_control svidctrl_regval, unsigned int vidcode) {
+
+ uint32_t temp;
+ uint32_t svid_cmd = 0;
+ uint32_t svid_dout = 0;
+ temp = svid_cmd_fmt((KNC_SVID_ADDR << 13) |
+ (KNC_SETVID_SLOW << 8) | vidcode);
+ svid_cmd = (KNC_SVID_ADDR << 5) | KNC_SETVID_SLOW;
+ svidctrl_regval.bits.svid_cmd = 0x0e0;
+ svidctrl_regval.bits.svid_cmd = svid_cmd;
+
+ svid_dout = temp & 0x1ff;
+ svidctrl_regval.bits.svid_dout = 0;
+ svidctrl_regval.bits.svid_dout = svid_dout;
+
+ svidctrl_regval.bits.cmd_start = 0x1;
+ pm_reg_write(svidctrl_regval.value, mic_ctx,
+ SBOX_SVID_CONTROL);
+
+ msleep(10);
+
+ return;
+}
+
/*
 * set_vid_knc - program the card core voltage (VID) over SVID on KNC.
 *
 * For B0 and later steppings: issue the SETVID command (retrying while
 * the SVID interface is busy), check the SVID error/ack bits, then
 * verify the new VID reads back from SBOX_COREVOLT; the whole sequence
 * is retried up to SET_VID_RETRY_COUNT times. On A-step the command is
 * issued once without verification (SBOX_COREVOLT is not reliable
 * there) followed by a fixed delay.
 *
 * Returns 0 on success, -EINVAL on SVID command failure or idle
 * timeout, -ENODEV when the voltage never reads back as @vidcode.
 */
int set_vid_knc(mic_ctx_t *mic_ctx, unsigned int vidcode)
{
	uint32_t status = 0;

	sbox_svid_control svidctrl_regval = {0};
	uint32_t svid_idle = 0;
	uint32_t svid_error = 0;
	int i = 0;
	uint32_t wait_cnt = 0;
	sbox_core_volt_t core_volt_regval = {0};
	int retry = 0;

	if (mic_ctx->bi_stepping >= KNC_B0_STEP) {
		for (retry = 0; retry < SET_VID_RETRY_COUNT; retry++) {
			status = 0;
			/* Wait for the SVID interface to go idle, then fire. */
			for (i = 0; i < KNC_SETVID_ATTEMPTS; i++) {
				svidctrl_regval.value = pm_reg_read(mic_ctx,SBOX_SVID_CONTROL);
				svid_idle = svidctrl_regval.bits.svid_idle;

				if (svid_idle) {
					set_vid(mic_ctx, svidctrl_regval, vidcode);
					/* Re-read to pick up the command result. */
					svidctrl_regval.value =
						pm_reg_read(mic_ctx,SBOX_SVID_CONTROL);
					svid_idle = svidctrl_regval.bits.svid_idle;
					svid_error = svidctrl_regval.bits.svid_error;

					if (!svid_idle) {
						/* Interface still busy: retry the command. */
						printk(KERN_ERR "%s SVID command failed - Idle not set\n",
							__func__);
						msleep(10);
						continue;
					}

					if (svid_error) {
						/* Ack bits distinguish rx vs tx parity failure. */
						if (SBOX_SVIDCTRL_ACK1ACK0(svidctrl_regval.value) == 0x2) {
							printk(KERN_ERR "%s SVID command failed - rx parity error\n",
								__func__);
						} else {
							printk(KERN_ERR "%s SVID command failed - tx parity error\n",
								__func__);
						}
						status = -EINVAL;
						goto exit;
					} else {
						PM_DEBUG("SVID Command Successful - VID set to %d\n",vidcode);
						break;
					}
				}
			}

			if (i == KNC_SETVID_ATTEMPTS) {
				printk(KERN_ERR "%s Timed out waiting for SVID idle\n", __func__);
				status = -EINVAL;
				goto exit;
			}

			/* Verify that the voltage is set */
			for(wait_cnt = 0; wait_cnt <= 100; wait_cnt++) {
				core_volt_regval.value = pm_reg_read(mic_ctx, SBOX_COREVOLT);
				if(vidcode == core_volt_regval.bits.vid) {
					return status;
				}
				msleep(10);
				PM_DEBUG("Retry: %d Voltage not set yet. vidcode = 0x%x Current vid = 0x%x\n",
					retry, vidcode, core_volt_regval.bits.vid);
			}

			/* Verification failed; retry the whole command. */
			PM_PRINT("Retry: %d Failed to set vid for node %d. vid code = 0x%x Current vid = 0x%x.\n",
				retry, mic_get_scifnode_id(mic_ctx), vidcode, core_volt_regval.bits.vid);
			status = -ENODEV;
		}
	} else {
		set_vid(mic_ctx, svidctrl_regval, vidcode);

		/* SBOX_COREVOLT does not reflect the correct vid
		 * value on A0. Just wait here for sometime to
		 * allow for the vid to be set.
		 */
		msleep(20);
	}

exit:
	return status;
}
+
+/* @print_nodemaskbuf
+ *
+ * @param - buf - the nodemask buffer
+ *
+ * prints the nodes in the nodemask.
+ *
+ * @returns - none
+ */
+void print_nodemaskbuf(uint8_t* buf) {
+
+ uint8_t *temp_buf_ptr;
+ uint32_t i,j;
+
+ temp_buf_ptr = buf;
+ PM_DEBUG("Nodes in nodemask: ");
+ for(i = 0; i <= ms_info.mi_maxid; i++) {
+ temp_buf_ptr = buf + i;
+ for (j = 0; j < 8; j++) {
+ if (get_nodemask_bit(temp_buf_ptr, j))
+ pr_debug("%d ", j + (i * 8));
+ }
+ }
+}
+
/*
 * restore_pc6_registers - undo the register programming performed for
 * PC6 entry so the card can operate normally again.
 *
 * @mic_ctx:   per-device driver context.
 * @from_dpc3: when true, the mclk-PLL-shutdown and prevent-auto-C3-exit
 *             bits are left untouched (only the GBOX PC6 flag, the
 *             group-B power-good mask and the booted flag are restored).
 */
void restore_pc6_registers(mic_ctx_t *mic_ctx, bool from_dpc3) {
	sbox_pcu_ctrl_t ctrl_regval = {0};
	sbox_uos_pcu_ctrl_t uos_ctrl_regval = {0};
	gbox_pm_control pmctrl_reg = {0};
	sbox_core_freq_t core_freq_reg = {0};

	if (!from_dpc3) {
		/* A-step keeps the mclk-PLL-shutdown enable in SBOX_PCU_CONTROL;
		 * later steppings moved it to SBOX_UOS_PCUCONTROL. */
		if(KNC_A_STEP == mic_ctx->bi_stepping) {
			ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL);
			ctrl_regval.bits.enable_mclk_pl_shutdown = 0;
			pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL);
		} else {
			uos_ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_UOS_PCUCONTROL);
			uos_ctrl_regval.bits.enable_mclk_pll_shutdown = 0;
			pm_reg_write(uos_ctrl_regval.value, mic_ctx, SBOX_UOS_PCUCONTROL);
		}


		/* Re-allow automatic C3 exit. */
		ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL);
		ctrl_regval.bits.prevent_auto_c3_exit = 0;
		pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL);
	}

	/* Clear the package-C6 indication in the GBOX. */
	pmctrl_reg.value = pm_reg_read(mic_ctx, GBOX_PM_CTRL);
	pmctrl_reg.bits.in_pckgc6 = 0;
	pm_reg_write(pmctrl_reg.value, mic_ctx, GBOX_PM_CTRL);

	/* Unmask group-B power-good. */
	ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL);
	ctrl_regval.bits.grpB_pwrgood_mask = 0;
	pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL);

	/* Re-assert the "booted" flag cleared during PC6 entry. */
	core_freq_reg.value = pm_reg_read(mic_ctx, SBOX_COREFREQ);
	core_freq_reg.bits.booted = 1;
	pm_reg_write(core_freq_reg.value, mic_ctx, SBOX_COREFREQ);
}
+
+void program_mclk_shutdown(mic_ctx_t *mic_ctx, bool set)
+{
+ sbox_uos_pcu_ctrl_t uos_ctrl_regval;
+ sbox_pcu_ctrl_t ctrl_regval;
+
+ if(KNC_A_STEP == mic_ctx->bi_stepping) {
+ ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL);
+ ctrl_regval.bits.enable_mclk_pl_shutdown = (set ? 1: 0);
+ pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL);
+ } else {
+ uos_ctrl_regval.value = pm_reg_read(mic_ctx,
+ SBOX_UOS_PCUCONTROL);
+ uos_ctrl_regval.bits.enable_mclk_pll_shutdown = (set ? 1: 0);
+ pm_reg_write(uos_ctrl_regval.value,
+ mic_ctx, SBOX_UOS_PCUCONTROL);
+ }
+}
+
+void program_prevent_C3Exit(mic_ctx_t *mic_ctx, bool set)
+{
+ sbox_pcu_ctrl_t ctrl_regval;
+ ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL);
+ ctrl_regval.bits.prevent_auto_c3_exit = (set ? 1: 0);
+ pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL);
+
+}
+
/*
 * pm_pc3_to_pc6_entry - attempt to take a node from PC3 into package C6.
 *
 * Sequence: verify the card reports PC3, gate the interrupt handler,
 * program C3-exit prevention and mclk shutdown, wait for the hardware
 * to go idle, flag PC6 in the GBOX, clear the booted flag, mask the
 * group-B power-good signal, drop the core voltage to 0, then record
 * PC6 in the driver and SBOX host state and quiesce DMA.
 *
 * Returns 0 on success; -EAGAIN when the card is not ready, -EFAULT
 * when the interrupt handler cannot be gated, or the set_vid_knc()
 * error. On failure the gated interrupt is re-enabled and (for VID
 * failure) the PC6 register programming is rolled back.
 */
int pm_pc3_to_pc6_entry(mic_ctx_t *mic_ctx)
{
	int err;
	sbox_pcu_ctrl_t ctrl_regval;
	gbox_pm_control pmctrl_reg;
	sbox_core_freq_t core_freq_reg;

	/* Card must already be in PC3 before PC6 entry can start. */
	if ((get_card_state(mic_ctx)) != PM_IDLE_STATE_PC3) {
		PM_DEBUG("Card not ready to go to PC6. \n");
		err = -EAGAIN;
		goto exit;
	}

	/* Gate the interrupt handler; bail if it is currently in use. */
	if (atomic_cmpxchg(&mic_ctx->gate_interrupt, 0, 1) == 1) {
		PM_DEBUG("Cannot gate interrupt handler while it is in use\n");
		err = -EFAULT;
		goto exit;
	}

	program_prevent_C3Exit(mic_ctx, true);
	program_mclk_shutdown(mic_ctx, true);

	/* Wait for uos to become idle. */
	if (!hw_idle(mic_ctx)) {
		/* Not idle: back out mclk shutdown and check once more. */
		program_mclk_shutdown(mic_ctx, false);
		if (!hw_idle(mic_ctx)) {
			program_prevent_C3Exit(mic_ctx, false);
			PM_DEBUG("Card not ready to go to PC6. \n");
			err = -EAGAIN;
			goto intr_ungate;
		} else {
			program_mclk_shutdown(mic_ctx, true);
		}
	}

	/* Flag package-C6 entry in the GBOX PM control register. */
	pmctrl_reg.value = pm_reg_read(mic_ctx, GBOX_PM_CTRL);
	pmctrl_reg.bits.in_pckgc6 = 1;
	pm_reg_write(pmctrl_reg.value, mic_ctx, GBOX_PM_CTRL);

	/* Clear the "booted" flag while the card is powered down. */
	core_freq_reg.value = pm_reg_read(mic_ctx, SBOX_COREFREQ);
	core_freq_reg.bits.booted = 0;
	pm_reg_write(core_freq_reg.value, mic_ctx, SBOX_COREFREQ);

	udelay(500);

	/* Mask group-B power-good before dropping the voltage. */
	ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL);
	ctrl_regval.bits.grpB_pwrgood_mask = 1;
	pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL);

	/* Drop core voltage to 0 for PC6. */
	err = set_vid_knc(mic_ctx, 0);
	if (err != 0) {
		PM_DEBUG("Aborting PC6 entry...Failed to set VID\n");
		restore_pc6_registers(mic_ctx, true);
		goto intr_ungate;
	}

	mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC6;
	set_host_state(mic_ctx, PM_IDLE_STATE_PC6);

	/* Quiesce the DMA engine before power drops. */
	dma_prep_suspend(mic_ctx->dma_handle);

	PM_PRINT("Node %d entered PC6\n",
		mic_get_scifnode_id(mic_ctx));

	return err;

intr_ungate:
	/* Re-enable the gated interrupt handler and kick the DPC. */
	atomic_set(&mic_ctx->gate_interrupt, 0);
	tasklet_schedule(&mic_ctx->bi_dpc);
exit:
	return err;
}
+
/*
 * pm_pc6_exit - execute package-C6 exit for a node.
 *
 * Sequence: verify the host recorded PC6, restore the TDP voltage
 * (the VID value published by the card in SBOX_HOST_PMSTATE), disable
 * mclk shutdown and C3-exit prevention, wait for the mclk PLL to lock,
 * unmask group-B power-good, wait for the hardware and then the card
 * to report active/PC0, and finally re-activate this node's entries in
 * the dependency matrix.
 *
 * @mic_ctx: the driver context of the node.
 *
 * Returns 0 on success or a negative errno; on failure the PC6
 * register programming is rolled back. In all cases the gated
 * interrupt handler is re-enabled and the DPC rescheduled.
 */
int pm_pc6_exit(mic_ctx_t *mic_ctx)
{

	int err = 0;

	sbox_host_pm_state_t hpmstate_regval;
	sbox_pcu_ctrl_t ctrl_regval;
	uint8_t tdp_vid = 0;
	uint8_t is_pll_locked;
	uint32_t wait_cnt;
	int i;


	if (!check_host_state(mic_ctx, PM_IDLE_STATE_PC6)) {
		PM_DEBUG("Wrong Host PM state. State = %d\n",
			get_host_state(mic_ctx));
		err = -EINVAL;
		goto restore_registers;
	}

	/* The TDP VID to restore is published in SBOX_HOST_PMSTATE. */
	hpmstate_regval.value = pm_reg_read(mic_ctx, SBOX_HOST_PMSTATE);
	tdp_vid = hpmstate_regval.bits.tdp_vid;
	PM_DEBUG("TDP_VID value obtained from Host PM Register = %d",tdp_vid);

	PM_DEBUG("Setting voltage to %dV using SVID Control\n",tdp_vid);
	err = set_vid_knc(mic_ctx, tdp_vid);
	if (err != 0) {
		printk(KERN_ERR "%s Failed PC6 entry...error in setting VID\n",
			__func__);
		goto restore_registers;
	}

	ctrl_regval.value = pm_reg_read(mic_ctx, SBOX_PCU_CONTROL);

	program_mclk_shutdown(mic_ctx, false);
	program_prevent_C3Exit(mic_ctx, false);

	/* Wait up to ~2s for the mclk PLL to lock. */
	for(wait_cnt = 0; wait_cnt < 200; wait_cnt++) {
		ctrl_regval.value = pm_reg_read(mic_ctx,SBOX_PCU_CONTROL);
		is_pll_locked = ctrl_regval.bits.mclk_pll_lock;
		if(likely(is_pll_locked))
			break;
		msleep(10);
	}

	if(wait_cnt >= 200) {
		PM_DEBUG("mclk_pll_locked bit is not set.\n");
		err = -EAGAIN;
		goto restore_registers;
	}

	/* Unmask group-B power-good now that the PLL is locked. */
	ctrl_regval.bits.grpB_pwrgood_mask = 0;
	pm_reg_write(ctrl_regval.value, mic_ctx, SBOX_PCU_CONTROL);

	if (!hw_active(mic_ctx)) {
		PM_DEBUG("Timing out waiting for hw to become active");
		goto restore_registers;
	}

	/* Wait up to ~2s for the card to report PC0. */
	for(wait_cnt = 0; wait_cnt < 200; wait_cnt++) {
		if ((get_card_state(mic_ctx)) == PM_IDLE_STATE_PC0)
			break;
		msleep(10);
	}

	if(wait_cnt >= 200) {
		PM_DEBUG("PC6 Exit not complete.\n");
		err = -EFAULT;
		goto restore_registers;
	}

	mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0;

	/* Re-arm dependencies that were disconnected while idle. */
	for (i = 0; i <= mic_data.dd_numdevs; i++) {
		if (micscif_get_nodedep(mic_get_scifnode_id(mic_ctx), i) ==
			DEP_STATE_DISCONNECTED) {
			micscif_set_nodedep(mic_get_scifnode_id(mic_ctx), i,
				DEP_STATE_DEPENDENT);
		}
	}

	PM_PRINT("Node %d exited PC6\n",
		mic_get_scifnode_id(mic_ctx));
	goto exit;

restore_registers:
	restore_pc6_registers(mic_ctx, false);
exit:
	/* Re-enable the gated interrupt handler and kick the DPC. */
	atomic_set(&mic_ctx->gate_interrupt, 0);
	tasklet_schedule(&mic_ctx->bi_dpc);
	return err;
}
+
+/*
+ * setup_pm_dependency:
+ *
+ * Function sets up the dependency matrix by populating
+ * the matrix with node depency information.
+ *
+ * Returns 0 on success. Appropriate error on failure.
+ */
+int setup_pm_dependency(void){
+ int err = 0;
+ uint16_t i;
+ uint16_t j;
+ mic_ctx_t *mic_ctx;
+
+ for (i = 0; i < mic_data.dd_numdevs; i++) {
+ mic_ctx = get_per_dev_ctx(i);
+ if (!mic_ctx) {
+ PM_DEBUG("Failed to retrieve driver context\n");
+ return -EFAULT;
+ }
+ if (mic_ctx->micpm_ctx.idle_state ==
+ PM_IDLE_STATE_PC3_READY) {
+ for (j = 0; j < mic_data.dd_numdevs; j++) {
+ if (micscif_get_nodedep(mic_get_scifnode_id(mic_ctx),j+1) ==
+ DEP_STATE_DEPENDENT) {
+ micscif_set_nodedep(mic_get_scifnode_id(mic_ctx),j+1,
+ DEP_STATE_DISCONNECT_READY);
+ }
+ }
+ }
+ }
+ return err;
+}
+
+/*
+ * teardown_pm_dependency
+ *
+ * Function resets dependency matrix by removing all depenendy info
+ * from it.
+ *
+ * Returns 0 on success. Appropriate error on failure.
+ */
+int teardown_pm_dependency(void) {
+ int err = 0;
+ int i;
+ int j;
+
+ for (i = 0; i < mic_data.dd_numdevs; i++) {
+ for (j = 0; j < mic_data.dd_numdevs; j++) {
+
+ if (micscif_get_nodedep(i+1,j+1) == DEP_STATE_DISCONNECT_READY) {
+ micscif_set_nodedep(i+1,j+1, DEP_STATE_DEPENDENT);
+ }
+ }
+ }
+ return err;
+}
+
+/*
+ * revert_idle_entry_trasaction:
+ *
+ * @node_disc_bitmask: Bitmask of nodes which were involved in the
+ * transaction
+ *
+ * Function Reverts idle state changes made to nodes when an idle
+ * state trasaction fails.
+ */
+int revert_idle_entry_trasaction(uint8_t *node_disc_bitmask) {
+ int err = 0;
+ mic_ctx_t *node_ctx;
+ uint32_t node_id = 0;
+
+ for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) {
+ if (node_id == SCIF_HOST_NODE)
+ continue;
+
+ if (!get_nodemask_bit(node_disc_bitmask, node_id))
+ continue;
+
+ node_ctx = get_per_dev_ctx(node_id - 1);
+ if (!node_ctx) {
+ PM_DEBUG("Failed to retrieve node context.");
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if (node_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC3) {
+ err = pm_pc3_exit(node_ctx);
+ if (err) {
+ PM_DEBUG("Wakeup of Node %d failed. Node is lost"
+ " and is to be disconnected",node_id);
+ node_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_LOST;
+ /* Since node is lost, ref_cnt increment(decement) through the
+ * pm_get(put)_reference interface is prevented by idle_state.
+ * We still need to ensure the ref_cnt iself is reset
+ * back to 0 so that pm_get(put)_reference will work after the
+ * lost node interface recovers the node. */
+ atomic_set(&node_ctx->micpm_ctx.pm_ref_cnt, 0);
+ }
+ }
+ }
+exit:
+ return err;
+}
+
+/* pm_node_disconnect
+ *
+ * Called during idlestate entry.
+ *
+ * Function checks the pm_ref_cnt and returns ACK
+ * or NACK depending on the pm_ref_cnt value.
+ */
+int pm_node_disconnect(uint8_t *nodemask) {
+
+ uint32_t node_id;
+ mic_ctx_t *mic_ctx;
+ int ret = 0;
+ int err = 0;
+
+ for (node_id = 0; node_id <= ms_info.mi_maxid; node_id++) {
+ if (node_id == SCIF_HOST_NODE)
+ continue;
+
+ if (!get_nodemask_bit(nodemask, node_id))
+ continue;
+
+ mic_ctx = get_per_dev_ctx(node_id - 1);
+ if (!mic_ctx) {
+ set_nodemask_bit(nodemask, node_id, 0);
+ return -EAGAIN;
+ }
+
+ if (mic_ctx->state != MIC_ONLINE) {
+ set_nodemask_bit(nodemask, node_id, 0);
+ return -EAGAIN;
+ }
+
+ ret = atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt,
+ 0, PM_NODE_IDLE);
+ if (((ret != 0) && (ret != PM_NODE_IDLE))
+ || atomic_read(&mic_data.dd_pm.wakeup_in_progress)) {
+ set_nodemask_bit(nodemask, node_id, 0);
+ return -EAGAIN;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * pm_pc3_entry:
+ *
+ * Execute pc3 entry for a node.
+ * mic_ctx: The driver context of the node.
+ */
+int pm_pc3_entry(mic_ctx_t *mic_ctx)
+{
+ int err = 0;
+ if (mic_ctx == NULL) {
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if (((!check_host_state(mic_ctx, PM_IDLE_STATE_PC0))) ||
+ (mic_ctx->micpm_ctx.idle_state != PM_IDLE_STATE_PC0)) {
+ PM_DEBUG("Wrong host state. register state = %d"
+ " idle state = %d\n", get_host_state(mic_ctx),
+ mic_ctx->micpm_ctx.idle_state);
+ goto send_wakeup;
+ }
+
+ /* cancel pc6 entry work that may be scheduled. We need to
+ * do this either here or after a pervious pc3 exit */
+ cancel_delayed_work_sync(&mic_ctx->micpm_ctx.pc6_entry_work);
+
+ if ((mic_ctx->micpm_ctx.con_state != PM_CONNECTED) ||
+ (!mic_ctx->micpm_ctx.pc3_enabled))
+ goto send_wakeup;
+
+ mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC3_READY;
+ err = do_idlestate_entry(mic_ctx);
+ if (err)
+ goto exit;
+ if ((mic_ctx->micpm_ctx.pc6_enabled) &&
+ (KNC_C_STEP <= mic_ctx->bi_stepping) &&
+ (KNC_B1_STEP != mic_ctx->bi_stepping)) {
+ queue_delayed_work(mic_ctx->micpm_ctx.pc6_entry_wq,
+ &mic_ctx->micpm_ctx.pc6_entry_work,
+ mic_ctx->micpm_ctx.pc6_timeout*HZ);
+ }
+
+ goto exit;
+
+send_wakeup:
+ mutex_lock(&mic_data.dd_pm.pm_idle_mutex);
+ pm_pc3_exit(mic_ctx);
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+exit:
+ return err;
+}
+
+/*
+ * pm_pc3_exit:
+ * Calling function needs to grab idle_state mutex.
+ *
+ * Execute pc3 exit for a node.
+ * mic_ctx: The driver context of the node.
+ */
+int pm_pc3_exit(mic_ctx_t *mic_ctx)
+{
+ int err;
+ int wait_cnt;
+
+ WARN_ON(!mutex_is_locked(&mic_data.dd_pm.pm_idle_mutex));
+ mic_send_pm_intr(mic_ctx);
+ for (wait_cnt = 0; wait_cnt < PC3_EXIT_WAIT_COUNT; wait_cnt++) {
+ if (check_card_state(mic_ctx, PM_IDLE_STATE_PC0))
+ break;
+ msleep(1);
+ }
+
+
+ if(wait_cnt >= PC3_EXIT_WAIT_COUNT) {
+ PM_DEBUG("Syncronization with card failed."
+ " Node is lost\n");
+ err = -EFAULT;
+ goto exit;
+ }
+
+ set_host_state(mic_ctx, PM_IDLE_STATE_PC0);
+ mic_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_PC0;
+ PM_DEBUG("Node %d exited PC3\n", mic_get_scifnode_id(mic_ctx));
+
+ return 0;
+exit:
+ return err;
+}
+
+/*
+ * do_idlestate_entry:
+ *
+ * Function to start the idle state entry transaction for a node. Puts a node
+ * and all the nodes that are dependent on this node to idle state if
+ * it is possible.
+ *
+ * mic_ctx: The device context of node that needs to be put in idle state
+ * Returs 0 in success. Appropriate error code on failure
+ */
+int do_idlestate_entry(mic_ctx_t *mic_ctx)
+{
+ int err = 0;
+ uint32_t node_id = 0;
+ mic_ctx_t *node_ctx;
+ uint8_t *nodemask_buf;
+
+ if(!mic_ctx)
+ return -EINVAL;
+
+ mutex_lock(&mic_data.dd_pm.pm_idle_mutex);
+
+ if ((err = setup_pm_dependency())) {
+ PM_DEBUG("Failed to set up PM specific dependencies");
+ goto unlock;
+ }
+
+ nodemask_buf = (uint8_t *)
+ kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL);
+ if(!nodemask_buf) {
+ PM_DEBUG("Error allocating nodemask buffer\n");
+ err = ENOMEM;
+ goto dep_teardown;
+ }
+
+ err = micscif_get_deactiveset(mic_get_scifnode_id(mic_ctx),
+ nodemask_buf, 1);
+ if (err) {
+ PM_DEBUG("Node disconnection failed "
+ "during deactivation set calculation");
+ goto free_buf;
+ }
+
+ print_nodemaskbuf(nodemask_buf);
+
+ if ((err = micscif_disconnect_node(mic_get_scifnode_id(mic_ctx),
+ nodemask_buf, DISCONN_TYPE_POWER_MGMT))) {
+ PM_DEBUG("SCIF Node disconnect failed. err: %d", err);
+ goto free_buf;
+ }
+
+ if ((err = pm_node_disconnect(nodemask_buf))) {
+ PM_DEBUG("PM Node disconnect failed. err = %d\n", err);
+ goto free_buf;
+ }
+
+ if ((err = micvcons_pm_disconnect_node(nodemask_buf,
+ DISCONN_TYPE_POWER_MGMT))) {
+ PM_DEBUG("VCONS Node disconnect failed. err = %d\n", err);
+ goto free_buf;
+ }
+
+ for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) {
+ if (node_id == SCIF_HOST_NODE)
+ continue;
+ if (!get_nodemask_bit(nodemask_buf, node_id))
+ continue;
+ node_ctx = get_per_dev_ctx(node_id - 1);
+ if (!node_ctx) {
+ PM_DEBUG("Failed to retrieve node context.");
+ err = -EINVAL;
+ goto revert;
+ }
+
+ if (node_ctx->micpm_ctx.idle_state ==
+ PM_IDLE_STATE_PC3_READY) {
+ set_host_state(node_ctx, PM_IDLE_STATE_PC3);
+ node_ctx->micpm_ctx.idle_state =
+ PM_IDLE_STATE_PC3;
+ PM_DEBUG("Node %d entered PC3\n",
+ mic_get_scifnode_id(node_ctx));
+ } else {
+ PM_DEBUG("Invalid idle state \n");
+ err = -EINVAL;
+ goto revert;
+ }
+ }
+
+revert:
+ if (err)
+ revert_idle_entry_trasaction(nodemask_buf);
+free_buf:
+ kfree(nodemask_buf);
+dep_teardown:
+ teardown_pm_dependency();
+unlock:
+ if (err && (mic_ctx->micpm_ctx.idle_state != PM_IDLE_STATE_PC0))
+ pm_pc3_exit(mic_ctx);
+
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ return err;
+}
+
+/**
+ * is_idlestate_exit_needed:
+ *
+ * @node_id[in]: node to wakeup.
+ *
+ * Method responsible for checking if idle state exit is required
+ * In some situation we would like to know whether node is idle or not before
+ * making decision to bring the node out of idle state.
+ * For example - Lost node detection.
+ * returns false if the node is not in IDLE state, returns true otherwise
+ */
+int
+is_idlestate_exit_needed(mic_ctx_t *mic_ctx)
+{
+ int ret = 0;
+ mutex_lock(&mic_data.dd_pm.pm_idle_mutex);
+
+ switch (mic_ctx->micpm_ctx.idle_state)
+ {
+ case PM_IDLE_STATE_PC0:
+ case PM_IDLE_STATE_LOST:
+ break;
+ case PM_IDLE_STATE_PC3:
+ case PM_IDLE_STATE_PC3_READY:
+ case PM_IDLE_STATE_PC6:
+ {
+ ret = 1;
+ break;
+ }
+ default:
+ ret = 1;
+ }
+
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ return ret;
+}
+
+/* do_idlestate_exit:
+ *
+ * Initiate idle state exits for nodes specified
+ * by the bitmask.
+ *
+ * mic_ctx: The device context.
+ * get_ref: Set to true if the entity that wants to wake
+ * a node up also wantes to get a reference to the node.
+ *
+ * Returs 0 on success. Appropriate error on failure.
+ *
+ */
+int do_idlestate_exit(mic_ctx_t *mic_ctx, bool get_ref) {
+ int err = 0;
+ uint32_t node_id = 0;
+ mic_ctx_t *node_ctx;
+ uint8_t *nodemask_buf;
+
+ if(!mic_ctx)
+ return -EINVAL;
+
+ might_sleep();
+ /* If the idle_state_mutex is already obtained by another thread
+ * try to wakeup the thread which MAY be waiting for REMOVE_NODE
+ * responses. This way, we give priority to idle state exits than
+ * idle state entries.
+ */
+ if (!mutex_trylock(&mic_data.dd_pm.pm_idle_mutex)) {
+ atomic_inc(&mic_data.dd_pm.wakeup_in_progress);
+ wake_up(&ms_info.mi_disconn_wq);
+ mutex_lock(&mic_data.dd_pm.pm_idle_mutex);
+ atomic_dec(&mic_data.dd_pm.wakeup_in_progress);
+ }
+
+ nodemask_buf = (uint8_t *)kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL);
+ if(!nodemask_buf) {
+ PM_DEBUG("Error allocating nodemask buffer\n");
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ err = ENOMEM;
+ goto abort_node_wake;
+ }
+
+ if ((err = micscif_get_activeset(mic_get_scifnode_id(mic_ctx), nodemask_buf))) {
+ PM_DEBUG("Node connect failed during Activation set calculation for node\n");
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ err = -EINVAL;
+ goto free_buf;
+ }
+
+ print_nodemaskbuf(nodemask_buf);
+
+ for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) {
+ if (node_id == SCIF_HOST_NODE)
+ continue;
+
+ if (!get_nodemask_bit(nodemask_buf, node_id))
+ continue;
+
+ node_ctx = get_per_dev_ctx(node_id - 1);
+ if (!node_ctx) {
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ goto free_buf;
+ }
+
+ switch (node_ctx->micpm_ctx.idle_state) {
+ case PM_IDLE_STATE_PC3:
+ case PM_IDLE_STATE_PC3_READY:
+ if ((err = pm_pc3_exit(node_ctx))) {
+ PM_DEBUG("Wakeup of Node %d failed."
+ "Node to be disconnected",node_id);
+ set_nodemask_bit(nodemask_buf, node_id, 0);
+ node_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_LOST;
+ /* Since node is lost, ref_cnt increment(decement) through the
+ * pm_get(put)_reference interface is prevented by idle_state.
+ * We still need to ensure the ref_cnt iself is reset
+ * back to 0 so that pm_get(put)_reference will work after the
+ * lost node interface recovers the node. */
+ atomic_set(&node_ctx->micpm_ctx.pm_ref_cnt, 0);
+ } else {
+ if ((mic_ctx == node_ctx) && get_ref)
+ if (atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt, PM_NODE_IDLE, 1) !=
+ PM_NODE_IDLE)
+ atomic_inc(&mic_ctx->micpm_ctx.pm_ref_cnt);
+ }
+ break;
+ case PM_IDLE_STATE_PC6:
+ if ((err = pm_pc6_exit(node_ctx))) {
+ PM_DEBUG("Wakeup of Node %d failed."
+ "Node to be disconnected",node_id);
+ set_nodemask_bit(nodemask_buf, node_id, 0);
+ node_ctx->micpm_ctx.idle_state = PM_IDLE_STATE_LOST;
+ /* Since node is lost, ref_cnt increment(decement) through the
+ * pm_get(put)_reference interface is prevented by idle_state.
+ * We still need to ensure the ref_cnt iself is reset
+ * back to 0 so that pm_get(put)_reference will work after the
+ * lost node interface recovers the node. */
+ atomic_set(&node_ctx->micpm_ctx.pm_ref_cnt, 0);
+ } else {
+ if ((mic_ctx == node_ctx) && get_ref)
+ if (atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt, PM_NODE_IDLE, 1) !=
+ PM_NODE_IDLE)
+ atomic_inc(&mic_ctx->micpm_ctx.pm_ref_cnt);
+ }
+ break;
+ case PM_IDLE_STATE_PC0:
+ PM_DEBUG("Node %d is in state %d "
+ "and already out of package state.\n",node_id,
+ node_ctx->micpm_ctx.idle_state);
+ if ((mic_ctx == node_ctx) && get_ref)
+ if (atomic_cmpxchg(&mic_ctx->micpm_ctx.pm_ref_cnt, PM_NODE_IDLE, 1) !=
+ PM_NODE_IDLE)
+ atomic_inc(&mic_ctx->micpm_ctx.pm_ref_cnt);
+ break;
+ default:
+ PM_DEBUG("Invalid idle state of node %d."
+ " State = %d \n", node_id,
+ node_ctx->micpm_ctx.idle_state);
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ err = -ENODEV;
+ goto free_buf;
+ }
+ }
+
+ /* Idle state exit of nodes are complete.
+ * Set the register state now for those nodes
+ * that are successfully up.
+ */
+ for(node_id = 0; node_id <= ms_info.mi_maxid; node_id++) {
+ if (node_id == SCIF_HOST_NODE)
+ continue;
+
+ if (!get_nodemask_bit(nodemask_buf, node_id))
+ continue;
+
+ node_ctx = get_per_dev_ctx(node_id - 1);
+ if (!node_ctx) {
+ PM_DEBUG("Failed to retrieve node context.");
+ continue;
+ }
+
+
+ if (node_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC0)
+ set_host_state(node_ctx, PM_IDLE_STATE_PC0);
+ }
+
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+free_buf:
+ kfree(nodemask_buf);
+abort_node_wake:
+ return err;
+}
+
+int pc6_entry_start(mic_ctx_t *mic_ctx) {
+
+ int err = 0;
+
+ if (mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC0) {
+ PM_DEBUG("Node not in PC3\n");
+ err = -EFAULT;
+ goto exit;
+ }
+
+ mutex_lock(&mic_data.dd_pm.pm_idle_mutex);
+
+ if (mic_ctx->micpm_ctx.idle_state != PM_IDLE_STATE_PC3) {
+ PM_DEBUG("PC6 transition failed. Node not in PC3\n");
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if ((err = pm_pc3_to_pc6_entry(mic_ctx))) {
+ PM_DEBUG("PC6 transition from PC3 failed for node %d\n",
+ mic_get_scifnode_id(mic_ctx));
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+ goto exit;
+ }
+ mutex_unlock(&mic_data.dd_pm.pm_idle_mutex);
+exit:
+ return err;
+
+}
+
+/*
+ * mic_get_scifnode_id:
+ *
+ * Function to retrieve node id of a scif node.
+ *
+ * mic_ctx: The driver context of the specified node.
+ * Returns the scif node_id of the specified node.
+ */
+uint32_t mic_get_scifnode_id(mic_ctx_t *mic_ctx) {
+ /* NOTE: scif node_id cannot assumed to be a simple increment
+ * of the bi_id of the driver context. This function is really
+ * a placeholder for the board_id to node_id conversion that
+ * we need to do in the host driver.
+ */
+ return (uint32_t)mic_ctx->bi_id + 1;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* code to send escape calls to uOS; meant to test the ring buffer */
+
+#include "mic_common.h"
+#include "mic/mic_dma_lib.h"
+#include "mic/mic_dma_api.h"
+#include <mic/micscif.h>
+#include <mic/micscif_smpt.h>
+
+// constants defined for flash commands for setting PCI aperture
+#define RASMM_DEFAULT_OFFSET 0x4000000
+#define RASMM_FLASH_SIZE 0x200000
+#define MAX_CORE_INDEX 61
+#define SKU_MEM_DIVIDE 4
+#define SKU_LOW_MEM 0
+#define SKU_HIGH_MEM 1
+#define FREQ_2P4 0x630
+#define FREQ_4P5 0x65A
+#define FREQ_5P0 0x664
+#define FREQ_5P5 0x66E
+#define MASK_MEMFREQ 0xfff
+#define SHIFT_MEMFREQ 16
+
+int
+mic_unpin_user_pages(struct page **pages, uint32_t nf_pages)
+{
+ uint32_t j = 0;
+ uint32_t status = 0;
+ if (pages) {
+ for (j = 0; j < nf_pages; j++) {
+ if (pages[j]) {
+ SetPageDirty(pages[j]);
+ page_cache_release(pages[j]);
+ }
+ }
+ kfree(pages);
+ }
+
+ return status;
+}
+
+int
+mic_pin_user_pages (void *data, struct page **pages, uint32_t len, int32_t *nf_pages, int32_t nr_pages)
+{
+
+ int32_t status = 0;
+
+
+ if (!(pages)) {
+ printk("%s Failed to allocate memory for pages\n", __func__);
+ status = -ENOMEM;
+ return status;
+
+ }
+
+ // pin the user pages; use semaphores on linux for doing the same
+ down_read(¤t->mm->mmap_sem);
+ *nf_pages = (int32_t)get_user_pages(current, current->mm, (uint64_t)data,
+ nr_pages, PROT_WRITE, 1, pages, NULL);
+ up_read(¤t->mm->mmap_sem);
+
+ // compare if the no of final pages is equal to no of requested pages
+ if ((*nf_pages) < nr_pages) {
+ printk("%s failed to do _get_user_pages\n", __func__);
+ status = -EFAULT;
+ mic_unpin_user_pages(pages, *nf_pages);
+ return status;
+ }
+
+
+ return status;
+
+}
+
/*
 * send_flash_cmd:
 *
 * Dispatch a flash/RAS maintenance command to the card's bootstrap
 * code by programming the SBOX scratch registers and (for most
 * commands) raising a bootstrap interrupt.
 *
 * mic_ctx: driver context of the node.
 * type: the flash/RAS command to execute.
 * data: user-space buffer used for command input/output.
 * len: byte length of @data.
 *
 * Returns 0 on success, negative errno on failure.
 */
int
send_flash_cmd(mic_ctx_t *mic_ctx, MIC_FLASH_CMD_TYPE type, void *data, uint32_t len)
{
	int32_t status = 0;
	uint8_t *mmio_va = mic_ctx->mmio.va;
	sbox_scratch1_reg_t scratch1reg = {0};	/* command/status register image */
	sbox_scratch2_reg_t scratch2reg = {0};	/* flash image address register image */
	uint32_t ret = 0;
	void *src;
	struct timeval t;
	struct flash_stat *statbuf = NULL;
	uint64_t temp;
	uint32_t i = 0;
	struct version_struct *verbuf = NULL;
	int32_t offset = 0;
	uint8_t cmddata = 0;

	scratch1reg.bits.status = FLASH_CMD_INVALID;
	switch (type) {
	case FLASH_CMD_READ:

		/*
		 * image address = the upper 20 bits of the 32-bit of scracth2 register
		 * is card side physical address where the flash image resides
		 * program scratch2 register to notify the image address
		 */
		scratch2reg.bits.image_addr = RASMM_DEFAULT_OFFSET >> 12;
		SBOX_WRITE(scratch2reg.value, mmio_va, SBOX_SCRATCH2);

		/* set command */
		scratch1reg.bits.command = FLASH_CMD_READ;
		SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1);

		/* Kick the card; completion is observed via FLASH_CMD_STATUS. */
		mic_send_bootstrap_intr(mic_ctx);
		break;

	case FLASH_CMD_READ_DATA:

		/*
		 * flash read_data command : set pci aperture to 128MB
		 * read the value of scratch2 in a variable
		 */
		ret = SBOX_READ(mmio_va, SBOX_SCRATCH2);
		scratch2reg.value = ret;

		/*
		 * convert physical to virtual address
		 * image address = the upper 20 bits of the 32-bit KNC side physical
		 * address where the flash image resides
		 */
		offset = scratch2reg.bits.image_addr << 12 ;
		/* Reject empty or aperture-overflowing reads before touching
		 * the aperture. */
		if (len == 0) {
			status = -EINVAL;
			goto exit;
		}

		if (len > (mic_ctx->aper.len - offset)) {
			status = -EINVAL;
			goto exit;
		}
		src = mic_ctx->aper.va + offset;

		temp = copy_to_user(data, src, len);
		if (temp > 0) {
			printk("error while copy to user \n");
			status = -EFAULT;
			goto exit;
		}
		break;

	case FLASH_CMD_ABORT:

		scratch1reg.bits.command = FLASH_CMD_ABORT;
		SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1);

		mic_send_bootstrap_intr(mic_ctx);
		break;

	case FLASH_CMD_VERSION:

		/*
		 * image address = the upper 20 bits of the 32-bit of scracth2 register
		 * is card side physical address where the flash image resides
		 */
		scratch2reg.bits.image_addr = RASMM_DEFAULT_OFFSET >> 12;
		SBOX_WRITE(scratch2reg.value, mmio_va, SBOX_SCRATCH2);

		/*
		 * flash version command : similar to read_data command.
		 * Instead of get_user_pages(), use kmalloc() as we are allocating
		 * buffer of lesser size
		 */
		scratch1reg.bits.command = FLASH_CMD_VERSION;
		SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1);

		mic_send_bootstrap_intr(mic_ctx);

		/* poll for completion */
		/* Unlike the other commands, VERSION is waited on inline:
		 * poll SCRATCH1 for up to ~3s (3000 * 1ms). */
		while(scratch1reg.bits.status != FLASH_CMD_COMPLETED) {
			ret = SBOX_READ(mmio_va, SBOX_SCRATCH1);
			scratch1reg.value = ret;
			msleep(1);
			i++;
			printk("Looping for status (time = %d ms)\n", i);
			if(i > 3000) {
				status = -ETIME;
				goto exit;
			}

		}

		src = mic_ctx->aper.va + RASMM_DEFAULT_OFFSET;

		if (len == 0) {
			status = -EINVAL;
			goto exit;
		}
		verbuf = kmalloc(len, GFP_KERNEL);
		if (!verbuf) {
			status = -ENOMEM;
			goto exit;
		}

		/* Copy the version block out of the aperture, then to user. */
		memcpy(verbuf, src, len);

		printk("header verbuf is : %x\n", verbuf->hdr_ver);
		printk("odm verbuf is : %x\n", verbuf->odm_ver);
		printk("uptd time bcd is : %llu\n", verbuf->upd_time_bcd);
		printk("updated verbuf is : %d\n", *((int*)(&verbuf->upd_ver)));
		printk("mfg time bcd is : %llu\n", verbuf->mfg_time_bcd);
		printk("mfg verbuf is : %d\n", *((int*)(&verbuf->mfg_ver)));

		temp = copy_to_user(data, verbuf, len);
		if(temp > 0) {
			printk("error while copy to user \n");
			status = -EFAULT;
			/* NOTE(review): the "if (verbuf)" guards here and
			 * below are redundant -- verbuf is non-NULL on
			 * this path, and kfree(NULL) is a no-op anyway. */
			if(verbuf) {
				kfree(verbuf);
			}
			goto exit;
		}

		if(verbuf) {
			kfree(verbuf);
		}

		break;

	case FLASH_CMD_WRITE:

		/* flash write command : pin user pages for the data buffer which contains
		 * the image.
		 * For the write command, we provide the offset for writing.
		 * GTT is set to 64MB and offset = 0.
		 */
		if (len > (mic_ctx->aper.len - RASMM_DEFAULT_OFFSET)) {
			status = -EINVAL;
			goto exit;
		}
		src = mic_ctx->aper.va + RASMM_DEFAULT_OFFSET;
		if (len == 0) {
			status = -EINVAL;
			goto exit;
		}
		/* Stage the image into the aperture, then tell the card
		 * where to find it. */
		temp = copy_from_user(src, data, len);
		if (temp > 0) {
			printk("error while copying from user \n");
			status = -EFAULT;
			goto exit;
		}

		/* image address = the upper 20 bits of the 32-bit KNC side physical
		 * address where the flash image resides
		 */
		scratch2reg.bits.image_addr = RASMM_DEFAULT_OFFSET >> 12;
		SBOX_WRITE(scratch2reg.value, mmio_va, SBOX_SCRATCH2);

		scratch1reg.bits.command = FLASH_CMD_WRITE;
		SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1);

		mic_send_bootstrap_intr(mic_ctx);
		;

		break;

	case RAS_CMD_CORE_DISABLE:
	case RAS_CMD_CORE_ENABLE:
		/* Core enable/disable carries a core index as payload,
		 * then deliberately falls through to the common RAS
		 * dispatch below. */
		if (copy_from_user(&cmddata, data, sizeof(cmddata))) {
			status = -EFAULT;
			goto exit;
		}
		scratch1reg.bits.cmd_data = cmddata;
		if (cmddata > MAX_CORE_INDEX) {
			printk("Parameter given is greater than physical core index\n");
			status = -EINVAL;
			goto exit;
		}

		/* fallthrough */
	case RAS_CMD:
	case RAS_CMD_INJECT_REPAIR:
	case RAS_CMD_ECC_DISABLE:
	case RAS_CMD_ECC_ENABLE:
	case RAS_CMD_EXIT:
		/* Publish the current wall-clock seconds to the card,
		 * then issue the RAS command. */
		do_gettimeofday(&t);
		SBOX_WRITE(t.tv_sec, mmio_va, SBOX_SCRATCH3);
		scratch1reg.bits.command = type;
		SBOX_WRITE(scratch1reg.value, mmio_va, SBOX_SCRATCH1);

		mic_send_bootstrap_intr(mic_ctx);

		break;

	case FLASH_CMD_STATUS:

		/* status command : mmio read of SCRATCH1 register
		 * The percentage completion is only updated on the
		 * Flash Write function as currently implemented.
		 * The other functions are expected to complete almost instantly
		 */
		if(len != sizeof(struct flash_stat)) {
			status = -EINVAL;
			goto exit;
		}
		/* NOTE(review): unreachable -- len == 0 cannot pass the
		 * sizeof check above. */
		if (len == 0) {
			status = -EINVAL;
			goto exit;
		}
		statbuf = kmalloc(len, GFP_KERNEL);
		if(!statbuf) {
			status = -ENOMEM;
			goto exit;
		}

		/* Decode SCRATCH1 into the user-visible status record. */
		temp = SBOX_READ(mmio_va, SBOX_SCRATCH1);
		scratch1reg.value = (uint32_t)temp;

		statbuf->status = scratch1reg.bits.status;
		statbuf->percent = scratch1reg.bits.percent;
		statbuf->smc_status = scratch1reg.bits.smc_status;
		statbuf->cmd_data = scratch1reg.bits.cmd_data;
		statbuf->mm_debug = scratch1reg.bits.mm_debug;

		temp = copy_to_user(data, statbuf, len);
		if(temp > 0) {
			printk("Error copying data to user buffer\n");
			status = -EFAULT;
			if(statbuf) {
				kfree(statbuf);
			}
			goto exit;
		}

		if(statbuf) {
			kfree(statbuf);
		}

		break;

	default:
		printk(KERN_ERR "Unknown command\n");
		status = -EOPNOTSUPP;
		break;

	}

	exit :
	return status;
}
+
+int get_cardside_mem(mic_ctx_t *mic_ctx, uint64_t start, uint64_t size, void *dest)
+{
+ int32_t status = 0;
+ uint64_t len;
+ uint64_t dest_pa;
+ struct dma_channel *ch = NULL;
+ int flags = 0;
+ int poll_cookie;
+ int i, next_page;
+ int j;
+ uint64_t num_pages;
+ uint64_t card_pa;
+ int32_t nf_pages = 0;
+ uint64_t nr_pages = 0;
+ struct page **pages = NULL;
+ void *pg_virt_add;
+ unsigned long t = jiffies;
+ int dma_ret = 0;
+ card_pa = start;
+ len = size;
+
+ if (len % PAGE_SIZE)
+ nr_pages = (len >> PAGE_SHIFT) + 1;
+ else
+ nr_pages = len >> PAGE_SHIFT;
+
+ flags |= DO_DMA_POLLING;
+ num_pages = len / PAGE_SIZE;
+ next_page = 0;
+
+ pages = kmalloc(nr_pages * sizeof(struct page*), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+ status = mic_pin_user_pages(dest, pages, (uint32_t)len, &nf_pages, (int32_t)nr_pages);
+
+ if (status)
+ goto exit;
+
+ /* allocate_dma_channel should fail in 2 cases : 1. if it doesnt get dma channel
+ * then it times out 2. there is no device present
+ */
+ status = micpm_get_reference(mic_ctx, true);
+ if (status)
+ goto exit;
+
+ while ((dma_ret = allocate_dma_channel(mic_ctx->dma_handle, &ch)) != 0) {
+ if (dma_ret == -ENODEV) {
+ printk("No device present\n");
+ status = -ENODEV;
+ goto put_ref;
+ }
+ msleep(1);
+ if (time_after(jiffies,t + NODE_ALIVE_TIMEOUT)) {
+ printk("dma channel allocation error\n");
+ status = -EBUSY;
+ goto put_ref;
+ }
+ }
+
+ for(j = 0; j < num_pages; j++) {
+ i = 0;
+ pg_virt_add = lowmem_page_address(pages[j]);
+ /* get card side address */
+ dest_pa = mic_ctx_map_single(mic_ctx, pg_virt_add, PAGE_SIZE);
+
+ /* do dma and keep polling for completion */
+ poll_cookie = do_dma(ch, flags, card_pa + next_page, dest_pa, PAGE_SIZE, NULL);
+ pr_debug("Poll cookie %d\n", poll_cookie);
+ if (0 > poll_cookie) {
+ printk("Error programming the dma descriptor\n");
+ status = poll_cookie;
+ goto put_ref;
+ } else if (-2 == poll_cookie) {
+ printk( "Copy was done successfully, check for validity\n");
+ } else if(-1 != poll_cookie) {
+ while (i < 10000 && 1 != poll_dma_completion(poll_cookie, ch)) {
+ i++;
+ }
+ if (i == 10000) {
+ printk("DMA timed out \n");
+ } else {
+ pr_debug("DMA SUCCESS at %d\n", i);
+ /* increment by PAGE_SIZE on DMA SUCCESS to transfer next page */
+ next_page = next_page + PAGE_SIZE;
+ }
+ }
+ mic_ctx_unmap_single(mic_ctx, (dma_addr_t)dest_pa, PAGE_SIZE);
+ }
+
+put_ref:
+ micpm_put_reference(mic_ctx);
+exit:
+ mic_unpin_user_pages(pages, nf_pages);
+ if (ch)
+ free_dma_channel(ch);
+ return status;
+}
+
+/* SKU functions */
+void
+sku_swap_list(struct list_head *in, struct list_head *out)
+{
+ struct list_head *pos, *tmp;
+ sku_info_t *node;
+ list_for_each_safe(pos, tmp, in) {
+ node = list_entry(pos, sku_info_t, sku);
+ list_del(pos);
+ list_add_tail(&node->sku, out);
+ }
+}
+
+int
+sku_create_node(uint32_t fuserev_low,
+ uint32_t fuserev_high, uint32_t mem_size,
+ uint32_t mem_freq, char *sku_name,
+ sku_info_t ** newnode)
+{
+ sku_info_t *temp;
+
+ temp = kmalloc(sizeof(sku_info_t), GFP_KERNEL);
+ if (temp == NULL)
+ return -ENOMEM;
+ temp->fuserev_low = fuserev_low;
+ temp->fuserev_high = fuserev_high;
+ temp->memsize = mem_size;
+ temp->memfreq = mem_freq;
+ strncpy(temp->sku_name, sku_name, SKU_NAME_LEN - 1);
+ temp->sku_name[SKU_NAME_LEN - 1] = '\0';
+ *newnode = temp;
+ return 0;
+}
+
+void
+sku_destroy_table()
+{
+ int i;
+ sku_info_t *node;
+ struct list_head *pos, *tmp;
+ for (i = 0; i < MAX_DEV_IDS; i++)
+ list_for_each_safe(pos, tmp, &mic_data.sku_table[i]) {
+ node = list_entry(pos, sku_info_t, sku);
+ list_del(pos);
+ kfree(node);
+ }
+}
+
+int
+sku_find(mic_ctx_t *mic_ctx, uint32_t device_id)
+{
+ int ret = 0;
+ uint32_t cnt = 0;
+ sku_info_t *match, *newnode = NULL, *skunode;
+ struct list_head skulist_memsize_in;
+ struct list_head skulist_memfreq_in;
+ struct list_head skulist_out;
+ uint32_t fuse_rev, memsize, memfreq;
+ struct list_head *pos, *tmp;
+ const char *invalid = "INVALID SKU";
+
+ /* Use the LSB as index to the array of pointers to the SKU table*/
+ device_id = device_id & 0xf;
+
+ if (device_id > MAX_DEV_IDS) {
+ strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ return -EINVAL;
+ }
+
+ INIT_LIST_HEAD(&skulist_memsize_in);
+ INIT_LIST_HEAD(&skulist_memfreq_in);
+ INIT_LIST_HEAD(&skulist_out);
+
+ /* Search by fuse_config_rev */
+ fuse_rev = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH7);
+ fuse_rev = (fuse_rev >> SHIFT_FUSE_CONFIG_REV) & MASK_FUSE_CONFIG_REV;
+
+ list_for_each_safe(pos, tmp, &mic_data.sku_table[device_id]) {
+ match = list_entry(pos, sku_info_t, sku);
+ if ((match->fuserev_low <= fuse_rev) && (match->fuserev_high >= fuse_rev)) {
+ cnt++;
+ ret = sku_create_node(match->fuserev_low, match->fuserev_high,
+ match->memsize, match->memfreq, match->sku_name, &newnode);
+ if (ret) {
+ strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ goto cleanup;
+ }
+ list_add_tail(&newnode->sku, &skulist_out);
+ }
+ }
+ /* If only one node is present, the match has been found */
+ if (cnt == 1) {
+ strncpy(mic_ctx->sku_name, newnode->sku_name, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ goto cleanup;
+ }
+
+ sku_swap_list(&skulist_out, &skulist_memsize_in);
+ /* Search by memsize */
+ memsize = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH0);
+ memsize = (memsize >> SHIFT_MEMSIZE) & MASK_MEMSIZE;
+ memsize = memsize >> 20;
+ if (memsize > SKU_MEM_DIVIDE)
+ memsize = SKU_HIGH_MEM;
+ else
+ memsize = SKU_LOW_MEM;
+
+ cnt = 0;
+ list_for_each_safe(pos, tmp, &skulist_memsize_in) {
+ match = list_entry(pos, sku_info_t, sku);
+ /* Use the MSB for comparison */
+ /* Assumption - From the latest documentation, a particular
+ * combination of device id and fuse_rev can either have memory
+ * <=4GB (SKU_LOW_MEM) or > 4GB (SKU_HIGH_MEM)
+ */
+ if (memsize == match->memsize) {
+ cnt++;
+ ret = sku_create_node(match->fuserev_low, match->fuserev_high,
+ match->memsize, match->memfreq, match->sku_name, &newnode);
+ if (ret) {
+ strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ goto cleanup;
+ }
+ list_add_tail(&newnode->sku, &skulist_out);
+ }
+
+ }
+ list_for_each_safe(pos, tmp, &skulist_memsize_in) {
+ skunode = list_entry(pos, sku_info_t, sku);
+ list_del(pos);
+ kfree(skunode);
+ }
+ if (cnt == 1) {
+ strncpy(mic_ctx->sku_name, newnode->sku_name, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ goto cleanup;
+ }
+
+ sku_swap_list(&skulist_out, &skulist_memfreq_in);
+ /* Search by memfreq */
+ memfreq = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH9);
+ memfreq = (memfreq >> SHIFT_MEMFREQ) & MASK_MEMFREQ;
+
+ cnt = 0;
+ list_for_each_safe(pos, tmp, &skulist_memfreq_in) {
+ match = list_entry(pos, sku_info_t, sku);
+ if (memfreq == match->memfreq) {
+ cnt++;
+ ret = sku_create_node(match->fuserev_low, match->fuserev_high,
+ match->memsize, match->memfreq, match->sku_name, &newnode);
+ if (ret) {
+ strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ goto cleanup;
+ }
+ list_add_tail(&newnode->sku, &skulist_out);
+ }
+
+ }
+ list_for_each_safe(pos, tmp, &skulist_memfreq_in) {
+ skunode = list_entry(pos, sku_info_t, sku);
+ list_del(pos);
+ kfree(skunode);
+ }
+ if (cnt == 1) {
+ strncpy(mic_ctx->sku_name, newnode->sku_name, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ } else {
+ strncpy(mic_ctx->sku_name, invalid, SKU_NAME_LEN - 1);
+ mic_ctx->sku_name[SKU_NAME_LEN - 1] = '\0';
+ }
+
+
+cleanup:
+ list_for_each_safe(pos, tmp, &skulist_out) {
+ skunode = list_entry(pos, sku_info_t, sku);
+ list_del(pos);
+ kfree(skunode);
+ }
+
+ return ret;
+}
+
+
+int
+sku_build_table(void)
+{
+ int i = 0;
+ sku_info_t *newnode = NULL;
+
+ for ( i = 0; i < MAX_DEV_IDS; i++)
+ INIT_LIST_HEAD(&mic_data.sku_table[i]);
+
+ /*2250*/
+ if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU1", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(0, 1, SKU_HIGH_MEM, FREQ_2P4, "A0PO-SKU1", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5,"ES1-SKU2", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(2, 3, SKU_HIGH_MEM, FREQ_4P5, "ES1-SKU2", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(4, 49, SKU_HIGH_MEM, FREQ_4P5, "ES1B-SKU2", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(50, 100, SKU_HIGH_MEM, FREQ_4P5, "B0PO-SKU2", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(101, 150, SKU_HIGH_MEM, FREQ_5P0, "ES2-P1640", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(153, 154, SKU_HIGH_MEM, FREQ_5P0, "B1PO-5110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(151, 152, SKU_HIGH_MEM, FREQ_5P0, "B1PO-P1640/D1650", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(156, 156, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-5110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(155, 155, SKU_HIGH_MEM, FREQ_5P0, "B1QS-5110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-5110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(158, 250, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-5110P/5120D", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P0, "C0-5110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P0, "C0QS-5110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P5, "C0-5120D", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P5, "C0QS-5120D", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P0, "C0PRQ-5110P/5140P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P5, "C0PRQ-5120D/5140D", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[0]);
+
+ /*2251*/
+ if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU2", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[1]);
+
+ if (sku_create_node(0, 1, SKU_HIGH_MEM, FREQ_2P4, "A0PO-SKU2", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[1]);
+
+ /*2252*/
+ if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU3", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[2]);
+
+ /*2253*/
+ if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU4", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[3]);
+
+ if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_2P4, "ES1-SKU5", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[3]);
+
+ if (sku_create_node(4, 49, SKU_LOW_MEM, FREQ_2P4, "ES1B-SKU5", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[3]);
+
+ if (sku_create_node(50, 100, SKU_LOW_MEM, FREQ_4P5, "B0PO-SKU5", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[3]);
+
+ /*2254*/
+
+ /*2255*/
+ if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKUX", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[5]);
+
+ /*2256*/
+ if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKU5", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[6]);
+
+ /*2257*/
+ if (sku_create_node(0, 1, SKU_LOW_MEM, FREQ_2P4, "A0PO-SKUZ", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[7]);
+
+ /*2258*/
+ if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5, "ES1-SKU1", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[8]);
+ if (sku_create_node(2, 3, SKU_HIGH_MEM, FREQ_4P5, "ES1-SKU1", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[8]);
+ if (sku_create_node(4, 49, SKU_HIGH_MEM, FREQ_5P5, "ES1B-SKU1", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[8]);
+ if (sku_create_node(50, 100, SKU_HIGH_MEM, FREQ_5P5, "B0PO-SKU1", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[8]);
+
+ /*2259*/
+ if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5, "ES1-SKU3", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[9]);
+
+ if (sku_create_node(2, 3, SKU_HIGH_MEM, FREQ_4P5, "ES1-SKU3", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[9]);
+
+ /*225A*/
+ if (sku_create_node(2, 3, SKU_LOW_MEM, FREQ_4P5, "ES1-SKU4", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[10]);
+
+ if (sku_create_node(4, 49, SKU_LOW_MEM, FREQ_5P0, "ES1B-SKU4", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[10]);
+
+ if (sku_create_node(50, 100, SKU_LOW_MEM, FREQ_5P0, "B0PO-SKU4", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[10]);
+
+ if (sku_create_node(101, 150, SKU_LOW_MEM, FREQ_5P0, "ES2-SKU4", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[10]);
+
+ /*225B*/
+ if (sku_create_node(4, 49, SKU_HIGH_MEM, FREQ_5P5, "ES1B-SKU3cs", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[11]);
+
+ if (sku_create_node(4, 49, SKU_LOW_MEM, FREQ_5P5, "ES1B-SKU3ncs", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[11]);
+
+ if (sku_create_node(50, 100, SKU_HIGH_MEM, FREQ_5P5, "B0PO-SKU3cs", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[11]);
+
+ if (sku_create_node(50, 100, SKU_LOW_MEM, FREQ_5P5, "B0PO-SKU3ncs", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[11]);
+
+ /*225C*/
+ if (sku_create_node(101, 150, SKU_HIGH_MEM, FREQ_5P5, "ES2-P/A/X 1750", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(153, 154, SKU_HIGH_MEM, FREQ_5P5, "B1PO-7110 P/A/X", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(155, 155, SKU_HIGH_MEM, FREQ_5P5, "B1QS-7110 P/A/X", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(151, 152, SKU_HIGH_MEM, FREQ_5P0, "B1PO-P/A 1750", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(156, 156, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-7110 P/A/X", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-7110 P/A/X", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(158, 202, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-7110 P/X", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(203, 250, SKU_HIGH_MEM, FREQ_5P5, "B1PRQ-SE10 P/X", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P5, "C0-7120 P/A/X/D", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P5, "C0QS-7120 P/A/X/D", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P5, "C0PRQ-7120 P/A/X/D", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[12]);
+
+ /*225D*/
+ if (sku_create_node(101, 150, SKU_LOW_MEM, FREQ_5P0, "ES2-P1310", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(101, 150, SKU_HIGH_MEM, FREQ_5P0, "ES2-A1330", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(153, 154, SKU_LOW_MEM, FREQ_5P0, "B1PO-3110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(153, 154, SKU_HIGH_MEM, FREQ_5P0, "B1PO-3115A", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(157, 157, SKU_LOW_MEM, FREQ_5P0, "B1PRQ-3110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-3115A", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(156, 156, SKU_LOW_MEM, FREQ_5P0, "B1PRQ-3110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(156, 156, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-3115A", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(155, 155, SKU_HIGH_MEM, FREQ_5P0, "B1QS-3115A", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(155, 155, SKU_LOW_MEM, FREQ_5P0, "B1QS-3110P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(158, 250, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-3120P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P0, "C0-3120 P/A", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P0, "C0QS-3120 P/A", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P0, "C0PRQ-3120/3140 P/A", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[13]);
+
+ /*225E*/
+ if (sku_create_node(157, 157, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-31S1P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[14]);
+
+ if (sku_create_node(158, 250, SKU_HIGH_MEM, FREQ_5P0, "B1PRQ-31S1P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[14]);
+
+ if (sku_create_node(251, 253, SKU_HIGH_MEM, FREQ_5P0, "C0-31S1P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[14]);
+
+ if (sku_create_node(254, 255, SKU_HIGH_MEM, FREQ_5P0, "C0QS-31S1P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[14]);
+
+ if (sku_create_node(256, 350, SKU_HIGH_MEM, FREQ_5P0, "C0PRQ-31S1P", &newnode))
+ return -ENOMEM;
+ list_add_tail(&newnode->sku, &mic_data.sku_table[14]);
+
+ return 0; // Successed
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* contains code to download uos on MIC card */
+
+#include "mic_common.h"
+#include <mic/ringbuffer.h>
+#include "micint.h"
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+#include "mic/mic_virtio.h"
+#include <linux/proc_fs.h>
+#include "mic/micveth.h"
+
+
+#define APERTURE_SEGMENT_SIZE ((1) * 1024 * 1024 * 1024ULL)
+
+#define UOS_RESERVE_SIZE_MIN ((128) * 1024 * 1024)
+#define OS_RESERVE_SIZE_MIN ((32) * 1024 * 1024)
+#define UOS_RESERVE_SIZE_MAX (((4) * 1024 * 1024 * 1024ULL) - ((4) * 1024))
+#define UOS_RESERVE_PERCENT 50
+
+#define UOS_WATCHDOG_TIMEOUT 5000 // default watchdog timeout in milliseconds
+
+#define PCIE_CLASS_CODE(x) ((x) >> 24 )
+
+/* zombie class code as per the HAS is 0xFF
+ * but on KNC, we found it as 0x03
+ */
+#define ZOMBIE_CLASS_CODE 0x03
+#define DISABLE_BAR 0x02
+#define RESET_FAILED_F2 12870
+#define RESET_FAILED_F4 13382
+
+void ramoops_remove(mic_ctx_t *mic_ctx);
+
+static struct proc_dir_entry *ramoops_dir;
+struct proc_dir_entry *vmcore_dir;
+
+
+static void adapter_dpc(unsigned long dpc);
+extern int mic_vhost_blk_probe(bd_info_t *bd_info);
+extern void mic_vhost_blk_remove(bd_info_t *bd_info);
+
+/* driver wide global common data */
+mic_data_t mic_data;
+extern int usagemode_param;
+extern bool mic_crash_dump_enabled;
+extern bool mic_watchdog_auto_reboot;
+
+static int64_t etc_comp = 0;
+
+static uint64_t
+etc_read(uint8_t *mmio_va)
+{
+ uint32_t low;
+ uint32_t hi1,hi2;
+
+ do {
+ hi1 = SBOX_READ(mmio_va, SBOX_ELAPSED_TIME_HIGH);
+ low = SBOX_READ(mmio_va, SBOX_ELAPSED_TIME_LOW);
+ hi2 = SBOX_READ(mmio_va, SBOX_ELAPSED_TIME_HIGH);
+ } while(hi1 != hi2);
+
+ return((uint64_t)((((uint64_t)hi1 << 32) | low) >> 5));
+}
+
/*
 * Measure the frequency error of the card's elapsed-time counter (ETC)
 * against the host's raw monotonic clock over a ~10 second window and
 * return the frequency delta (in Hz) to compensate by.  The result is
 * clamped to the equivalent of +/-2500ppm.  NOTE: each attempt busy-waits
 * ~10 seconds via mdelay(), so this is only suitable for one-time init.
 */
static int64_t
calc_deltaf(mic_ctx_t *mic_ctx)
{
	const int64_t ETC_CLK_FREQ = 15625000;
	const uint32_t TIME_DELAY_IN_SEC = 10;
	const int64_t etc_cnt1 = ETC_CLK_FREQ * TIME_DELAY_IN_SEC;
	int64_t etc_cnt2;

	uint64_t cnt1, cnt2;
	int64_t deltaf_in_ppm, deltaf;

	/*
	 * (etc_freq2 / etc_freq1) = (etc_count2 / etc_count1)
	 * etc_freq1 = ETC_CLK_FREQ
	 * => etc_count1 = TIME_DELAY_IN_SEC * ETC_CLK_FREQ
	 * (etc_freq2 / etc_freq1) = (etc_count2 / etc_count1)
	 * etc_freq2 = etc_freq1 * (etc_count2 / etc_count1)
	 * etc_freq2 - etc_freq1 = etc_freq1((etc_count2 / etc_count1) - 1)
	 * deltaf = etc_freq1(etc_count2 - etc_count1)/etc_count1
	 * deltaf_in_ppm = deltaf * 10 ^ 6 / etc_freq1
	 * deltaf_in_ppm = ((etc_count2 - etc_count1) * 10 ^ 6) / etc_count1
	 */
	/* Need to implement the monotonic/irqsave logic for windows */
	unsigned long flags;
	struct timespec ts1, ts2;
	int64_t mono_ns;
	int i = 0;
	do {
		/* Interrupts are disabled so each ETC read and its host
		 * timestamp are taken back-to-back without preemption. */
		local_irq_save(flags);
		cnt1 = etc_read(mic_ctx->mmio.va);
		getrawmonotonic(&ts1);
		local_irq_restore(flags);
		mdelay(TIME_DELAY_IN_SEC * 1000);
		local_irq_save(flags);
		cnt2 = etc_read(mic_ctx->mmio.va);
		getrawmonotonic(&ts2);
		local_irq_restore(flags);
		etc_cnt2 = cnt2 - cnt1;
		ts2 = timespec_sub(ts2, ts1);
		mono_ns = timespec_to_ns(&ts2);
		/* Recalculate etc_cnt2 based on getrawmonotonic */
		/* Scale the observed ETC ticks to the nominal 10s window so
		 * mdelay() inaccuracy does not skew the comparison. */
		etc_cnt2 = (etc_cnt2 * TIME_DELAY_IN_SEC * 1000 * 1000 * 1000) / mono_ns;
		deltaf = ( ETC_CLK_FREQ * (etc_cnt2 - etc_cnt1)) / etc_cnt1;
		deltaf_in_ppm = (1000 * 1000 * (etc_cnt2 - etc_cnt1)) / etc_cnt1;
		i++;
		/*
		 * HSD #4844900
		 * On some of the systems deltaf_in_ppm is turning out
		 * way higher than expected. The only reasons I can think of
		 * are:
		 * i) mmio traffic cauing variable delays for mmio read
		 * ii) NMIs affecting this code
		 */
		/* Retry up to 10 times while the measurement is out of the
		 * plausible +/-2700ppm band. */
	} while (i < 10 && (deltaf_in_ppm > 2700 || deltaf_in_ppm < -2700));

	pr_debug("etc deltaf: %lld\n", deltaf);
	/*
	 * For intel chipsets, Spread Spectrum Clocking (SSC) (in the limit)
	 * is downspread with a frequency of 30hz and an amplitude of 0.5%
	 * which translates to 2500ppm. This is also the ppm observed on KNC + CrownPass
	 * Hence, if ppm > 2500, the code would need to retry to eliminate any chance of error
	 * Added an error margin of 1ppm (etc mmio reads can take really long time)
	 */
	if (deltaf_in_ppm > 2700 || deltaf_in_ppm < -2700) {
		printk(KERN_ERR "ETC timer compensation(%lldppm) is much higher"
				"than expected\n", deltaf_in_ppm);
		/*
		 * HSD #4844900
		 * Clamp etc compensation to 2500ppm
		 */
		if (deltaf_in_ppm > 2700)
			deltaf_in_ppm = 2500;
		else
			deltaf_in_ppm = -2500;
		deltaf = (ETC_CLK_FREQ * deltaf_in_ppm) / (1000 * 1000);
	}
	/* Deadband: tiny positive deltas are treated as measurement noise. */
	if (deltaf > 0 && deltaf <= 10)
		deltaf = 0;
	return deltaf;
}
+
+void
+calculate_etc_compensation(mic_ctx_t *mic_ctx)
+{
+ if (mic_ctx->bi_family == FAMILY_KNC) {
+ if (!etc_comp)
+ etc_comp = calc_deltaf(mic_ctx);
+ mic_ctx->etc_comp = etc_comp;
+ }
+}
+
+/*
+ DESCRIPTION:: waits for bootstrap loader is finished
+ PARAMETERS::
+ [in]void *mmio_va - virtual address to access MMIO registers
+ RETURN_VALUE:: 0 if successful, non-zero if failure
+*/
+int
+wait_for_bootstrap(uint8_t *mmio_va)
+{
+ uint32_t scratch2 = 0;
+ int count = 0;
+#ifdef MIC_IS_EMULATION
+ int wait_time = 0;
+#endif
+
+ // Wait until the boot loader is finished
+ while (!SCRATCH2_DOWNLOAD_STATUS(scratch2)) {
+ msleep(100);
+ if (count == 600) {
+#ifndef MIC_IS_EMULATION
+ printk("Firmware is not responding with ready bit\n");
+ return -EIO;
+#else
+ /* We don't want to be polling too often on the emulator, it is SLOW! */
+ pr_debug("Wait for bootstrap: %d min(s) \n", wait_time++);
+ count = 0;
+#endif
+ }
+
+ count++;
+ scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2);
+ }
+
+ return 0;
+}
+
+/*
+ DESCRIPTION::gets adapter memory size. calculates size based on scratch register 0
+ PARAMETERS::
+ [in]void *mmio_va - virtual address to access MMIO registers
+ [out]uint32_t *adapter_mem_size - adapter memory size
+ RETURN_VALUE:: none
+*/
+void
+get_adapter_memsize(uint8_t *mmio_va, uint32_t *adapter_mem_size)
+{
+ uint32_t memsize = 0;
+ uint32_t scratch0 = {0};
+
+ scratch0 = SBOX_READ(mmio_va, SBOX_SCRATCH0);
+ memsize = SCRATCH0_MEM_SIZE_KB(scratch0) * ((1) * 1024);
+
+ // Adjust the memory size based on the memory usage
+ switch (SCRATCH0_MEM_USAGE(scratch0)) {
+ case SCR0_MEM_ALL:
+ // Do nothing
+ break;
+
+ case SCR0_MEM_HALF:
+ memsize /= 2;
+ break;
+
+ case SCR0_MEM_THIRD:
+ memsize /= 3;
+ break;
+
+ case SCR0_MEM_FOURTH:
+ memsize /= 4;
+ break;
+
+ default:
+ // DBG_ASSERT_MSG(false, "Invalid memory usage specified by the bootstrap.\n");
+ break;
+ }
+
+ *adapter_mem_size = memsize;
+}
+
+/*
+ DESCRIPTION:: gets uos load offset from scratch register 2
+ PARAMETERS::
+ [in]void *mmio_va - virtual address to access MMIO registers
+ [out]uint32_t *uos_load_offset - offset at which uos will be loaded
+ RETURN_VALUE:: none
+*/
+void
+get_uos_loadoffset(uint8_t *mmio_va, uint32_t *uos_load_offset)
+{
+ uint32_t scratch2 = 0;
+
+ scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2);
+ *uos_load_offset = SCRATCH2_DOWNLOAD_ADDR(scratch2);
+}
+
+/*
+ DESCRIPTION:: gets reserved size for uos
+ PARAMETERS::
+ [out]uint32_t *uos_reserve_size - reserved uos size
+ RETURN_VALUE:: none
+*/
+void
+get_uos_reserved_size(uint8_t* mmio_va, uint32_t adapter_memsize, uint32_t *uos_reserve_size)
+{
+ uint32_t reserve_size = 0;
+
+ // Only calculate if not explicitly specified by the user
+ reserve_size = (uint32_t)(adapter_memsize * UOS_RESERVE_PERCENT / 100);
+
+ // Make sure there is at least WINDOWS_RESERVE_SIZE_MIN bytes
+ reserve_size = GET_MIN(reserve_size, adapter_memsize - OS_RESERVE_SIZE_MIN);
+
+ // Keep in mind maximum uos reserve size is uint32_t, so we never overflow
+ reserve_size = GET_MIN(reserve_size, UOS_RESERVE_SIZE_MAX);
+ reserve_size = GET_MAX(reserve_size, UOS_RESERVE_SIZE_MIN);
+
+ // Always align uos reserve size to a page
+ reserve_size = (uint32_t)AlignLow(reserve_size, ((4) * 1024));
+
+ *uos_reserve_size = reserve_size;
+}
+
+/*
+ DESCRIPTION:: gets APIC ID from scratch register 2
+ PARAMETERS::
+ [in]void *mmio_va - virtual address to access MMIO registers
+ [out]uint32_t *apic_id - apic id
+ RETURN_VALUE:: none
+*/
+void
+get_apic_id(uint8_t *mmio_va, uint32_t *apic_id)
+{
+ uint32_t scratch2 = 0;
+
+ scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2);
+ *apic_id = SCRATCH2_APIC_ID(scratch2);
+}
+
+/*
+ DESCRIPTION::program the PCI aperture as a contiguous window. (only supports upto 4GB memory)
+ PARAMETERS::
+ [in]mic_ctx_t *mic_ctx - mic ctx
+ [in]int gtt_index - beginning gtt entry index
+ [in]uint64_t phy_addr - physical address for PCI aperture
+ [in]uint32_t num_bytes - size of PCI aperture
+ RETURN_VALUE:: None
+ */
+void
+set_pci_aperture(mic_ctx_t *mic_ctx, uint32_t gtt_index, uint64_t phy_addr, uint32_t num_bytes)
+{
+ uint32_t num_pages;
+ uint32_t gtt_entry;
+ uint32_t i;
+
+ num_pages = ALIGN(num_bytes, PAGE_SIZE) >> PAGE_SHIFT;
+
+ for (i = 0; i < num_pages; i++) {
+
+ gtt_entry = ((uint32_t)(phy_addr >> PAGE_SHIFT) + i) << 1 | 0x1u;
+ GTT_WRITE(gtt_entry, mic_ctx->mmio.va, (gtt_index + i)*sizeof(gtt_entry));
+ }
+
+ // XPU_RACE_CONDITION:
+ // Writing GttTlbFlushReg DOES NOT flush all write transactions from SBOX to GDDR
+ // because GttTlbFlushReg is an SBOX register and transaction terminates in SBOX
+ // MMIO write must use MIC ringbus to be serializing.
+ // Writing GTT itself DOES serialize: GTT is in MMIO space, and write goes to the ringbus
+ // MemoryBarrier makes sure all writes make it to GDDR before tlbFlush write
+ smp_mb(); // FIXME: only needs SFENCE
+
+ // write any value to cause a flush
+ SBOX_WRITE(1, mic_ctx->mmio.va, SBOX_TLB_FLUSH);
+}
+
+/*
+ DESCRIPTION:: Programs a scratch register that the bootstrap reads to determine
+ how large is uOS image.
+ PARAMETERS::
+ [in]void *mmio_va - virtual address to mmio register,
+ [in]uint32_t uos_size - size of uos image
+ RETURN_VALUE:: none
+*/
+void
+set_uos_size(uint8_t *mmio_va, uint32_t uos_size)
+{
+ uint32_t scratch5;
+
+ scratch5 = uos_size;
+ // XPU_RACE_CONDITION: write to MMIO space is uncached and flushes WC buffers
+ SBOX_WRITE(scratch5, mmio_va, SBOX_SCRATCH5);
+}
+
+/*
+ DESCRIPTION:: Programs a scratch register that the uOS reads to determine how
+ much memory to reserve.
+ PARAMETERS::
+ [in]void *mmio_va - virtual address to mmio register,
+ [in]uint32_t uos_reserved_size - size of memory to be reserved by uos.
+ RETURN_VALUE:: none
+*/
+void
+set_uos_reserved_size(uint8_t *mmio_va, uint32_t uos_reserved_size)
+{
+ uint32_t scratch3;
+
+ scratch3 = uos_reserved_size;
+ // XPU_RACE_CONDITION: write to MMIO space is uncached and flushes WC buffers
+ SBOX_WRITE(scratch3, mmio_va, SBOX_SCRATCH3);
+}
+
+/*
+ DESCRIPTION:: .
+ PARAMETERS::
+ [in]uint32_t device_id - device ID,
+ RETURN_VALUE:: family type
+*/
+product_family_t
+get_product_family(uint32_t device_id)
+{
+ product_family_t product_family;
+
+ switch (device_id) {
+ case PCI_DEVICE_ABR_2249:
+ case PCI_DEVICE_ABR_224a:
+ product_family = FAMILY_ABR;
+ break;
+
+ case PCI_DEVICE_KNC_2250:
+ case PCI_DEVICE_KNC_2251:
+ case PCI_DEVICE_KNC_2252:
+ case PCI_DEVICE_KNC_2253:
+ case PCI_DEVICE_KNC_2254:
+ case PCI_DEVICE_KNC_2255:
+ case PCI_DEVICE_KNC_2256:
+ case PCI_DEVICE_KNC_2257:
+ case PCI_DEVICE_KNC_2258:
+ case PCI_DEVICE_KNC_2259:
+ case PCI_DEVICE_KNC_225a:
+ case PCI_DEVICE_KNC_225b:
+ case PCI_DEVICE_KNC_225c:
+ case PCI_DEVICE_KNC_225d:
+ case PCI_DEVICE_KNC_225e:
+ product_family = FAMILY_KNC;
+ break;
+
+ default:
+ pr_debug( "Invalid/Unknown device ID %d\r\n", device_id);
+ product_family = FAMILY_UNKNOWN;
+ break;
+ }
+
+ return product_family;
+}
+
+/*
+ DESCRIPTION:: loads uos image at given path into gddr
+ PARAMETERS::
+ [in]mic_ctx_t *mic_ctx - mic context
+ [in]imgname - file path for uos file to be loaded
+ [out]uos_size - size of uos image
+ */
+int
+load_uos_into_gddr(mic_ctx_t *mic_ctx, char *imgname, uint32_t* uos_size, uint64_t *uos_cmd_offset)
+{
+ void *aperture_va;
+ uint8_t *mmio_va;
+ uint32_t apic_id = 0;
+ uint32_t uos_load_offset = 0;
+ uint32_t adapter_memsize = 0;
+ int status = 0;
+
+ aperture_va = mic_ctx->aper.va;
+ mmio_va = mic_ctx->mmio.va;
+
+ if (mic_ctx->state != MIC_BOOT) {
+ printk("Not in booting state\n");
+ return -EPERM;
+ }
+
+ status = mic_get_file_size(imgname, uos_size);
+
+ if (status) {
+ mic_ctx->state = MIC_BOOTFAIL;
+ printk("Linux image not found at %s , status returned %d\n", imgname, status);
+ return status;
+ }
+
+ get_uos_loadoffset(mmio_va, &uos_load_offset);
+ // Determine the uOS reserve size after we have the m_pXpu interface
+ get_adapter_memsize(mmio_va, &adapter_memsize);
+
+ get_apic_id(mmio_va, &apic_id);
+ // store apic_id in adapter context for later use
+ mic_ctx->apic_id = apic_id;
+
+ if (mic_ctx->bi_family == FAMILY_ABR){
+ // Program the PCI aperture as a contiguous window
+ // Need an extra page to provide enough buffer space for command line arguments.
+ set_pci_aperture(mic_ctx, 0, uos_load_offset, *uos_size + PAGE_SIZE);
+ uos_load_offset = 0;
+ }
+
+ // transfer uOs image file to gddr
+ status = mic_load_file(imgname, ((uint8_t*)aperture_va) + uos_load_offset, *uos_size);
+
+ // for the emulator we want to skip "downloading" the file
+ *uos_cmd_offset = (uint64_t)uos_load_offset + *uos_size;
+
+ // This only applies to KNF bootstrap, it is NOT needed for KNC
+ if (mic_ctx->bi_family == FAMILY_ABR) {
+ // clear UOS load offset register after uOS was uploaded
+ SBOX_WRITE(0, mmio_va, SBOX_SCRATCH2);
+ SBOX_READ(mmio_va, SBOX_SCRATCH2);
+ }
+
+ return status;
+}
+
+/*
+ DESCRIPTION:: loads uos initramfs image at given path into gddr for KNC.
+ PARAMETERS::
+ [in]mic_ctx_t *mic_ctx - mic context
+ [in]initramfsname - file path for uos initramfs file to be loaded
+ */
int
load_initramfs(mic_ctx_t *mic_ctx, char *initramfsname, uint32_t *initramfs_image, uint32_t *initramfs_size)
{
	uint8_t *aperture_va;
	uint8_t *mmio_va;
	uint32_t apic_id = 0;
	uint32_t uos_load_offset = 0;
	uint32_t file_load_offset = 0;
	uint32_t adapter_memsize = 0;
	uint32_t file_size = 0;
	int status = 0;
	uint32_t *ramfs_addr_ptr;

	aperture_va = mic_ctx->aper.va;
	mmio_va = mic_ctx->mmio.va;

	/* Only a card in the MIC_BOOT state may receive an initramfs. */
	if (mic_ctx->state != MIC_BOOT) {
		printk("Not in booting state\n");
		return -EPERM;
	}

	status = mic_get_file_size(initramfsname, &file_size);

	if (status) {
		mic_ctx->state = MIC_BOOTFAIL;
		printk("Init ram disk image not found at %s , status returned %d\n", initramfsname, status);
		return status;
	}

	get_uos_loadoffset(mmio_va, &uos_load_offset);
	/* Double the kernel load offset so the initramfs lands above the
	 * kernel image in GDDR. */
	file_load_offset = uos_load_offset << 1; /* Place initramfs higher than kernel; 128MB is ok */

	/* Report the chosen load address and size back to the caller. */
	*initramfs_size = file_size;
	*initramfs_image = file_load_offset;

	// Determine the uOS reserve size after we have the m_pXpu interface
	get_adapter_memsize(mmio_va, &adapter_memsize);
	get_apic_id(mmio_va, &apic_id);

	// store apic_id in adapter context for later use
	mic_ctx->apic_id = apic_id;

	// transfer uOs image file to gddr
	status = mic_load_file(initramfsname, aperture_va + file_load_offset, file_size);
	/* NOTE(review): the header below is patched even if mic_load_file()
	 * failed; the caller is expected to act on the returned status. */

	// write the initramfs load address and size to the fields in the kernel header
	/* Offsets 0x218/0x21c are the ramdisk_image/ramdisk_size fields of
	 * the x86 Linux boot-protocol setup header — confirm against the
	 * boot protocol documentation before changing. */
	ramfs_addr_ptr = (uint32_t *)(aperture_va + uos_load_offset + 0x218);
	*ramfs_addr_ptr = file_load_offset;
	ramfs_addr_ptr = (uint32_t *)(aperture_va + uos_load_offset + 0x21c);
	*ramfs_addr_ptr = *initramfs_size;

	return status;
}
+
/* Pair of 64-bit values: an endpoint value and a magic cookie.
 * NOTE(review): no users are visible in this part of the file — confirm
 * the intended layout against the queue-pair setup code before changing. */
struct tmpqp {
	uint64_t ep;	/* endpoint value (presumably a pointer/handle — verify) */
	uint64_t magic;	/* validation magic */
};
+
+int
+load_command_line(mic_ctx_t *mic_ctx, uint64_t uos_cmd_offset)
+{
+ void *cmd_line_va = mic_ctx->aper.va + uos_cmd_offset;
+ uint32_t cmdlen = 0;
+ char *buf = NULL;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE)
+ struct board_info *bi = mic_ctx->bd_info;
+#endif
+
+#ifdef USE_VCONSOLE
+ micvcons_t *vcons = &mic_ctx->bi_vcons;
+ dma_addr_t vc_hdr_dma_addr = 0;
+#endif
+
+ /*
+ * mic_ctx->boot_mem will also be set in IOCTL to boot the card in restricted memory
+ * FIXME::This code is added to keep the backward compatibility with IOCTLs
+ */
+ if (mic_ctx->bi_family == FAMILY_KNC)
+ if (mic_ctx->boot_mem == 0 || mic_ctx->boot_mem > mic_ctx->aper.len >> 20)
+ mic_ctx->boot_mem = (uint32_t)(mic_ctx->aper.len >> 20);
+ if (!(buf = kzalloc(MIC_CMDLINE_BUFSIZE, GFP_KERNEL))) {
+ printk(KERN_ERR "failed to allocate %d bytes for uOS command line\n",
+ MIC_CMDLINE_BUFSIZE);
+ return -ENOMEM;
+ }
+
+ cmdlen = snprintf(buf, MIC_CMDLINE_BUFSIZE, "card=%d vnet=%s scif_id=%d scif_addr=0x%llx",
+ mic_ctx->bi_id, mic_vnet_modes[mic_vnet_mode],
+ mic_ctx->bi_id + 1, mic_ctx->bi_scif.si_pa);
+
+ if (mic_vnet_mode == VNET_MODE_DMA) {
+ struct micvnet_info *vnet_info = mic_ctx->bi_vethinfo;
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " vnet_addr=0x%llx", vnet_info->vi_rp_phys);
+ }
+
+#ifdef USE_VCONSOLE
+ if (vcons->dc_enabled)
+ vc_hdr_dma_addr = vcons->dc_hdr_dma_addr;
+
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " vcons_hdr_addr=0x%llx", vc_hdr_dma_addr);
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE)
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen, " virtio_addr=0x%llx",
+ mic_ctx_map_single(mic_ctx, bi->bi_virtio, sizeof(struct vb_shared)));
+#endif
+
+ if (mic_ctx->boot_mem)
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " mem=%dM", mic_ctx->boot_mem);
+ mic_ctx->boot_mem = 0;
+
+ if (mic_ctx->ramoops_size)
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " ramoops_size=%d ramoops_addr=0x%llx",
+ mic_ctx->ramoops_size, mic_ctx->ramoops_pa[0]);
+
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " p2p=%d p2p_proxy=%d", mic_p2p_enable, mic_p2p_proxy_enable);
+
+ if (mic_ctx->bi_family == FAMILY_KNC)
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " etc_comp=%lld", mic_ctx->etc_comp);
+
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " reg_cache=%d", mic_reg_cache_enable);
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " ulimit=%d", mic_ulimit_check);
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " huge_page=%d", mic_huge_page_enable);
+ if (mic_crash_dump_enabled)
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " crashkernel=1M@80M");
+ /*
+ * Limitations in the Intel Jaketown and Ivytown platforms require SCIF
+ * to proxy P2P DMA read transfers in order to convert them into a P2P DMA
+ * write for better performance. The SCIF module on MIC needs the
+ * numa node the MIC is connected to on the host to make decisions
+ * about whether to proxy P2P DMA reads or not based on whether the two MIC
+ * devices are connected to the same QPI/socket/numa node or not.
+ * The assumption here is that a socket/QPI will have a unique
+ * numa node number.
+ */
+ pr_debug("CPU family = %d, CPU model = %d\n", boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ if (mic_p2p_proxy_enable && (boot_cpu_data.x86==6) &&
+ (boot_cpu_data.x86_model == 45 || boot_cpu_data.x86_model == 62)) {
+ int numa_node = dev_to_node(&mic_ctx->bi_pdev->dev);
+ if (-1 != numa_node) {
+ if (boot_cpu_data.x86_model == 45)
+ ms_info.mi_proxy_dma_threshold = SCIF_PROXY_DMA_THRESHOLD_JKT;
+ if (boot_cpu_data.x86_model == 62)
+ ms_info.mi_proxy_dma_threshold = SCIF_PROXY_DMA_THRESHOLD_IVT;
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " numa_node=%d", numa_node);
+ cmdlen += snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " p2p_proxy_thresh=%lld", ms_info.mi_proxy_dma_threshold);
+ }
+ }
+
+ if (mic_ctx->sysfs_info.cmdline != NULL)
+ snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " %s", mic_ctx->sysfs_info.cmdline);
+ else
+ snprintf(buf + cmdlen, MIC_CMDLINE_BUFSIZE - cmdlen,
+ " hostname=mic%d ipaddr=171.31.%d.2 quiet console=ttyS0,115200n8",
+ mic_ctx->bi_id, mic_ctx->bi_id + 1);
+
+ memcpy_toio(cmd_line_va, buf, strlen(buf) + 1);
+
+ if (mic_ctx->sysfs_info.kernel_cmdline != NULL)
+ kfree(mic_ctx->sysfs_info.kernel_cmdline);
+
+ if ((mic_ctx->sysfs_info.kernel_cmdline = kmalloc(strlen(buf) + 1, GFP_KERNEL)) != NULL)
+ strcpy(mic_ctx->sysfs_info.kernel_cmdline, buf);
+
+ kfree(buf);
+ return 0;
+}
+
+/*
+ DESCRIPTION:: method responsible for programming scratch register with uos image size
+ and notifying bootstrap to start booting uos
+ PARAMETERS::
+ [in]mic_ctx_t *mic_ctx - mic context
+ [in]uint32_t uos_size - size of uos image
+ */
+int
+notify_uosboot(mic_ctx_t *mic_ctx, uint32_t uos_size)
+{
+ int status = 0;
+ uint32_t adapter_memsize = 0;
+ uint32_t uos_reserved_size = 0;
+ uint8_t* mmio_va = mic_ctx->mmio.va;
+
+ // Program the register with uOS image size for bootstrap
+ set_uos_size(mmio_va, uos_size);
+
+ get_adapter_memsize(mmio_va, &adapter_memsize);
+
+ // Program the register to inform the uOS of how much space to reserve
+ get_uos_reserved_size(mmio_va, adapter_memsize, &uos_reserved_size);
+ set_uos_reserved_size(mmio_va, uos_reserved_size);
+
+ mic_send_bootstrap_intr(mic_ctx);
+
+ return status;
+}
+
+/*
+ DESCRIPTION :: boots Linux OS on the card
+ PARAMETERS ::
+ [in]mic_ctx_t *mic_ctx - mic context
+ [in]char *imgname - file path for uos image to be loaded on the card
+ RETURN_VALUE:: 0 if successful, non-zero if failure
+*/
+int
+boot_linux_uos(mic_ctx_t *mic_ctx, char *imgname, char *initramfsname)
+{
+ int status = 0;
+ uint32_t uos_size = 0;
+ uint64_t uos_cmd_offset = 0;
+ uint32_t initramfs_image = 0;
+ uint32_t initramfs_size = 0;
+
+ printk("MIC %d Booting\n", mic_ctx->bi_id);
+
+ if (mic_ctx->state != MIC_BOOT) {
+ printk(KERN_ERR "MIC %d is not in offline mode\n", mic_ctx->bi_id);
+ return -EPERM;
+ }
+
+ //loads uos image at given path into gddr
+ if ((status = load_uos_into_gddr(mic_ctx, imgname, &uos_size, &uos_cmd_offset)) != 0) {
+ printk("Cannot load uos in gddr\n");
+ return status;
+ }
+
+ if (initramfsname && (status = load_initramfs(mic_ctx, initramfsname, &initramfs_image, &initramfs_size)) != 0) {
+ printk("Cannot load initramfs in gddr\n");
+ return status;
+ }
+
+ status = load_command_line(mic_ctx, uos_cmd_offset);
+
+ //program scratch register with uos image size and notify bootstrap
+ status = notify_uosboot(mic_ctx, uos_size);
+
+ return status;
+}
+
+/*
+ DESCRIPTION :: boots Maintenance mode handler on the card
+ PARAMETERS ::
+ [in]mic_ctx_t *mic_ctx - mic context
+ [in]char *imgname - file path for uos image to be loaded on the card
+ RETURN_VALUE:: 0 if successful, non-zero if failure
+*/
+int boot_micdev_app(mic_ctx_t *mic_ctx, char *imgname)
+{
+ int status = 0;
+ uint32_t uos_size = 0;
+ uint8_t *mmio_va = 0;
+ uint64_t uos_cmd_offset = 0;
+ int32_t temp_scratch2 = 0;
+
+ printk("MIC %d Booting\n", mic_ctx->bi_id);
+ mmio_va = mic_ctx->mmio.va;
+ status = load_uos_into_gddr(mic_ctx, imgname, &uos_size, &uos_cmd_offset);
+ if(status) {
+ printk("Cannot load uos in gddr\n");
+ goto exit;
+ }
+
+ temp_scratch2 = SBOX_READ(mmio_va, SBOX_SCRATCH2);
+ /* clear download bit */
+ temp_scratch2 = SCRATCH2_CLEAR_DOWNLOAD_STATUS(temp_scratch2);
+ SBOX_WRITE(temp_scratch2, mmio_va, SBOX_SCRATCH2);
+
+ //program scratch register with uos image size and notify bootstrap
+ status = notify_uosboot(mic_ctx, uos_size);
+ if(status)
+ goto exit;
+ status = wait_for_bootstrap(mmio_va);
+exit:
+ if(status) {
+ mic_setstate(mic_ctx, MIC_BOOTFAIL);
+ } else {
+ mic_setstate(mic_ctx, MIC_ONLINE);
+ mic_ctx->boot_count++;
+ printk("ELF booted succesfully\n");
+ ;
+ }
+ return status;
+}
+
/*
 * Timer callback that polls the card while a hardware reset is in
 * progress.  Re-arms itself once a second until the bootstrap signals
 * ready, the reset is declared failed, or an F2/F4 postcode triggers a
 * reattempt.  Runs in timer (softirq) context.
 */
void
reset_timer(unsigned long arg)
{
	mic_ctx_t *mic_ctx = (mic_ctx_t *)arg;
	uint32_t scratch2 = 0;
	uint32_t postcode = mic_getpostcode(mic_ctx);

	/* Postcode is two ASCII characters packed into the low 16 bits. */
	printk("mic%d: Resetting (Post Code %c%c)\n", mic_ctx->bi_id,
	       postcode & 0xff, (postcode >> 8) & 0xff);
	mic_ctx->reset_count++;

	/* Assuming that the bootstrap takes around 90 seconds to reset,
	 * we fail after 300 seconds, thus allowing 3 attempts to reset
	 */
	/* Fail when the retry budget is exhausted, the postcode reads as
	 * all-0/all-1 (device not responding on PCI), or a previous pass
	 * already marked the reset failed. */
	if (mic_ctx->reset_count == RESET_FAIL_TIME ||
		!postcode || 0xffffffff == postcode || mic_ctx->state == MIC_RESETFAIL) {
		mic_ctx->reset_count = 0;
		mic_setstate(mic_ctx, MIC_RESETFAIL);
		wake_up(&mic_ctx->resetwq);
		printk("MIC %d RESETFAIL postcode %c%c %d\n", mic_ctx->bi_id,
		       postcode & 0xff, (postcode >> 8) & 0xff, postcode);
		return;
	}

	/* check for F2 or F4 error codes from bootstrap */
	if ((postcode == RESET_FAILED_F2) || (postcode == RESET_FAILED_F4)) {
		/* Defer the reattempt to process context via the reset
		 * workqueue; if it is already gone, give up. */
		if (mic_ctx->resetworkq) {
			queue_work(mic_ctx->resetworkq, &mic_ctx->resetwork);
		} else {
			mic_ctx->reset_count = 0;
			mic_setstate(mic_ctx, MIC_RESETFAIL);
			wake_up(&mic_ctx->resetwq);
			return;
		}
	}

	/* checking if bootstrap is ready or still resetting */
	scratch2 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH2);
	if (SCRATCH2_DOWNLOAD_STATUS(scratch2)) {
		mic_ctx->boot_start = 0;
		mic_setstate(mic_ctx, MIC_READY);

		/* Reset cleared the card's interrupt/SMPT state; restore it
		 * before anyone waiting on resetwq proceeds. */
		if (mic_ctx->msie)
			mic_enable_msi_interrupts(mic_ctx);
		mic_enable_interrupts(mic_ctx);
		mic_smpt_restore(mic_ctx);
		micscif_start(mic_ctx);

		wake_up(&mic_ctx->resetwq);
		mic_ctx->reset_count = 0;

		return;
	}

	/* Not ready yet: poll again in one second. */
	mic_ctx->boot_timer.function = reset_timer;
	mic_ctx->boot_timer.data = (unsigned long)mic_ctx;
	mic_ctx->boot_timer.expires = jiffies + HZ;

	add_timer(&mic_ctx->boot_timer);
}
+
+void
+adapter_wait_reset(mic_ctx_t *mic_ctx)
+{
+ mic_ctx->boot_timer.function = reset_timer;
+ mic_ctx->boot_timer.data = (unsigned long)mic_ctx;
+ mic_ctx->boot_timer.expires = jiffies + HZ;
+ mic_ctx->boot_start = jiffies;
+
+ add_timer(&mic_ctx->boot_timer);
+}
+
/*
 * Trigger a hardware reset of the card via the SBOX RGCR register.
 *
 * wait_reset != 0 arms the reset_timer poll loop afterwards; reattempt
 * distinguishes a retry after an F2/F4 failure (which must not be
 * short-circuited by the state check below).  The scratch-register
 * clears must precede the RGCR write -- see the inline comments.
 */
void
adapter_reset(mic_ctx_t *mic_ctx, int wait_reset, int reattempt)
{
	uint32_t resetReg;
	mutex_lock(&mic_ctx->state_lock);
	/* TODO: check state for lost node as well once design is done */
	/* Already resetting or ready: nothing to do unless this is a
	 * deliberate reattempt. */
	if ((mic_ctx->state == MIC_RESET || mic_ctx->state == MIC_READY) && (reattempt == 0)) {
		if (wait_reset == 0) {
			mic_setstate(mic_ctx, MIC_INVALID);
			del_timer_sync(&mic_ctx->boot_timer);
			mutex_unlock(&mic_ctx->state_lock);
			return;
		}
		mutex_unlock(&mic_ctx->state_lock);
		return;
	}

	mic_setstate(mic_ctx, MIC_RESET);

	mutex_unlock(&mic_ctx->state_lock);

	/* Stop any boot/reset poll timer before touching the hardware. */
	del_timer_sync(&mic_ctx->boot_timer);

	//Write 0 to uos download status otherwise we might continue booting
	//before reset has completed...
	SBOX_WRITE(0, mic_ctx->mmio.va, SBOX_SCRATCH2);

	// Virtual network link value should be 0 before reset
	SBOX_WRITE(0, mic_ctx->mmio.va, SBOX_SCRATCH14);

	// Data from Doorbell1 about restart/shutdown should be 0 before reset
	SBOX_WRITE(0, mic_ctx->mmio.va, SBOX_SDBIC1);

	//This will trigger reset
	resetReg = SBOX_READ(mic_ctx->mmio.va, SBOX_RGCR);
	resetReg |= 0x1;
	SBOX_WRITE(resetReg, mic_ctx->mmio.va, SBOX_RGCR);

	/* At least of KNF it seems we really want to delay at least 1 second */
	/* after touching reset to prevent a lot of problems. */
	msleep(1000);

	if (!wait_reset) {
		return;
	}

	/* Poll for reset completion via reset_timer. */
	adapter_wait_reset(mic_ctx);

}
+
+void ramoops_flip(mic_ctx_t *mic_ctx);
+
+int
+adapter_shutdown_device(mic_ctx_t *mic_ctx)
+{
+ ;
+
+ if (micpm_get_reference(mic_ctx, true))
+ return 0;
+
+ mutex_lock(&mic_ctx->state_lock);
+ if (mic_ctx->state == MIC_ONLINE) {
+ mic_setstate(mic_ctx, MIC_SHUTDOWN);
+
+ /*
+ * Writing to SBOX RDMASR0 will generate an interrupt
+ * on the uOS which will initiate orderly shutdown.
+ */
+ mic_send_sht_intr(mic_ctx);
+ }
+ mutex_unlock(&mic_ctx->state_lock);
+
+ micpm_put_reference(mic_ctx);
+ return 0;
+}
+
+int
+adapter_stop_device(mic_ctx_t *mic_ctx, int wait_reset, int reattempt)
+{
+ ;
+
+ micvcons_stop(mic_ctx);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \
+ defined(RHEL_RELEASE_CODE)
+ mic_vhost_blk_stop(mic_ctx->bd_info);
+#endif
+ micveth_stop(mic_ctx);
+
+ micpm_stop(mic_ctx);
+ micscif_stop(mic_ctx);
+ vmcore_remove(mic_ctx);
+ close_dma_device(mic_ctx->bi_id + 1, &mic_ctx->dma_handle);
+ ramoops_flip(mic_ctx);
+
+ /* Calling adapter_reset after issuing Host shutdown/reboot
+ * leads to randon NMIs. These are not rleated to any Card in
+ * specific but occurs on the PCI bridge. */
+ if ((system_state == SYSTEM_POWER_OFF) ||
+ (system_state == SYSTEM_RESTART) ||
+ (system_state == SYSTEM_HALT))
+ return 0;
+ adapter_reset(mic_ctx, wait_reset, reattempt);
+
+ return 0;
+}
+
+static void
+destroy_reset_workqueue(mic_ctx_t *mic_ctx)
+{
+ struct workqueue_struct *tempworkq;
+ tempworkq = mic_ctx->resetworkq;
+ mic_ctx->resetworkq = NULL;
+ destroy_workqueue(tempworkq);
+ del_timer_sync(&mic_ctx->boot_timer);
+}
+
/*
 * Final teardown of one adapter: release the virtual console buffers,
 * unwind every subsystem probed in adapter_probe()/adapter_init_device(),
 * then unmap the MMIO and aperture BARs.  Always returns 0.
 */
int
adapter_remove(mic_ctx_t *mic_ctx)
{

#ifdef USE_VCONSOLE
	/* Unmap and free the vcons header and data pages, if allocated. */
	if (mic_ctx->bi_vcons.dc_hdr_virt) {
		mic_ctx_unmap_single(mic_ctx, mic_ctx->bi_vcons.dc_hdr_dma_addr,
				     sizeof(struct vcons_buf));
		kfree(mic_ctx->bi_vcons.dc_hdr_virt);
		mic_ctx->bi_vcons.dc_hdr_virt = NULL;
	}

	if (mic_ctx->bi_vcons.dc_buf_virt) {
		mic_ctx_unmap_single(mic_ctx, mic_ctx->bi_vcons.dc_dma_addr,
				     MICVCONS_BUF_SIZE);
		free_pages((uint64_t)mic_ctx->bi_vcons.dc_buf_virt, 0);
		mic_ctx->bi_vcons.dc_buf_virt = NULL;
	}
#endif

	mic_psmi_uninit(mic_ctx);
	micpm_remove(mic_ctx);
	micscif_remove(mic_ctx);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE)
	mic_vhost_blk_remove(mic_ctx->bd_info);
#endif
	micveth_remove(mic_ctx);
	mic_unreg_irqhandler(mic_ctx, 0x1, "MIC SHUTDOWN DoorBell 1");

	ramoops_remove(mic_ctx);
	vmcore_remove(mic_ctx);
	mic_smpt_uninit(mic_ctx);
	/* Make sure that no reset timer is running after the workqueue is destroyed */
	destroy_reset_workqueue(mic_ctx);

	if (mic_ctx->mmio.va) {
		iounmap((void *)mic_ctx->mmio.va);
		mic_ctx->mmio.va = 0;
	}

	if (mic_ctx->aper.va) {
		iounmap((void *)mic_ctx->aper.va);
		mic_ctx->aper.va = 0;
	}


	return 0;
}
+
+#define MIC_MAX_BOOT_TIME 180 // Maximum number of seconds to wait for boot to complete
+
+static void
+online_timer(unsigned long arg)
+{
+ mic_ctx_t *mic_ctx = (mic_ctx_t *)arg;
+ uint64_t delay = (jiffies - mic_ctx->boot_start) / HZ;
+
+ if (mic_ctx->state == MIC_ONLINE)
+ return;
+
+ if (delay > MIC_MAX_BOOT_TIME) {
+ printk("Fail booting MIC %d. Wait time execeed %d seconds\n", mic_ctx->bi_id, MIC_MAX_BOOT_TIME);
+ mic_ctx->state = MIC_BOOTFAIL;
+ return;
+ }
+
+ mic_ctx->boot_timer.function = online_timer;
+ mic_ctx->boot_timer.data = (unsigned long)mic_ctx;
+ mic_ctx->boot_timer.expires = jiffies + HZ;
+ add_timer(&mic_ctx->boot_timer);
+
+ if (!(delay % 5))
+ printk("Waiting for MIC %d boot %lld\n", mic_ctx->bi_id, delay);
+}
+
/*
 * Timer callback that polls for the card's network link after boot was
 * kicked off.  Once the link is up it hands off to online_timer and
 * schedules the post-boot work; otherwise it re-arms once a second,
 * failing the boot after MIC_MAX_BOOT_TIME seconds.  Runs in timer
 * (softirq) context.
 */
static void
boot_timer(unsigned long arg)
{
	mic_ctx_t *mic_ctx = (mic_ctx_t *)arg;
	struct micvnet_info *vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo;
	uint64_t delay = (jiffies - mic_ctx->boot_start) / HZ;
	bool timer_restart = false;

	/* Boot was aborted elsewhere; stop polling. */
	if ((mic_ctx->state != MIC_BOOT) && (mic_ctx->state != MIC_ONLINE)) {
		return;
	}

	if (delay > MIC_MAX_BOOT_TIME) {
		printk("Fail booting MIC %d. Wait time execeed %d seconds\n", mic_ctx->bi_id, MIC_MAX_BOOT_TIME);
		mic_ctx->state = MIC_BOOTFAIL;
		return;
	}

	if (!(delay % 5))
		printk("Waiting for MIC %d boot %lld\n", mic_ctx->bi_id, delay);

	/* Link detection depends on the vnet mode: in non-DMA modes the card
	 * reports link-up through SBOX_SCRATCH14; in DMA mode we watch the
	 * vnet state machine (and the adapter state). */
	if (mic_vnet_mode != VNET_MODE_DMA)
		timer_restart = (SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH14) == 0)?
			true : false;
	else if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP)
		timer_restart = (mic_ctx->state != MIC_ONLINE)? true: false;

	if (timer_restart) {
		mic_ctx->boot_timer.function = boot_timer;
		mic_ctx->boot_timer.data = (unsigned long)mic_ctx;
		mic_ctx->boot_timer.expires = jiffies + HZ;

		add_timer(&mic_ctx->boot_timer);
		return;
	}

	/* Link is up: switch to online_timer to wait for MIC_ONLINE. */
	mic_ctx->boot_timer.function = online_timer;
	mic_ctx->boot_timer.data = (unsigned long)mic_ctx;
	mic_ctx->boot_timer.expires = jiffies + HZ;
	add_timer(&mic_ctx->boot_timer);

	printk("MIC %d Network link is up\n", mic_ctx->bi_id);
	schedule_work(&mic_ctx->boot_ws);
}
+
+void
+post_boot_startup(struct work_struct *work)
+{
+
+ mic_ctx_t *mic_ctx
+ = container_of(work, mic_ctx_t, boot_ws);
+
+ if (micpm_get_reference(mic_ctx, true) != 0)
+ return;
+
+ // We should only enable DMA after uos is booted
+ BUG_ON(open_dma_device(mic_ctx->bi_id+1,
+ mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS,
+ &mic_ctx->dma_handle));
+ if (micveth_start(mic_ctx))
+ printk(KERN_ERR "%s: micveth_start failed\n", __FUNCTION__);
+ micpm_put_reference(mic_ctx);
+
+}
+
+void
+attempt_reset(struct work_struct *work)
+{
+ mic_ctx_t *mic_ctx
+ = container_of(work, mic_ctx_t, resetwork);
+ printk("Reattempting reset after F2/F4 failure\n");
+ adapter_reset(mic_ctx, RESET_WAIT, RESET_REATTEMPT);
+}
+
+static void
+ioremap_work(struct work_struct *work)
+{
+ mic_ctx_t *mic_ctx
+ = container_of(work, mic_ctx_t, ioremapwork);
+ mic_ctx->aper.va = ioremap_wc(mic_ctx->aper.pa, mic_ctx->aper.len);
+ if (mic_ctx->aper.va == NULL) {
+ printk(KERN_ERR "mic %d: failed to map aperture space\n", mic_ctx->bi_id);
+ mutex_lock(&mic_ctx->state_lock);
+ mic_setstate(mic_ctx, MIC_RESETFAIL);
+ mutex_unlock(&mic_ctx->state_lock);
+ }
+ wake_up(&mic_ctx->ioremapwq);
+}
+
+int
+adapter_post_boot_device(mic_ctx_t *mic_ctx)
+{
+ mic_ctx->boot_timer.function = boot_timer;
+ mic_ctx->boot_timer.data = (unsigned long)mic_ctx;
+ mic_ctx->boot_timer.expires = jiffies + HZ;
+ mic_ctx->boot_start = jiffies;
+
+ add_timer(&mic_ctx->boot_timer);
+ return 0;
+}
+
+int
+mic_shutdown_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell)
+{
+ struct micscif_dev *dev = &scif_dev[mic_get_scifnode_id(mic_ctx)];
+ mic_ctx->sdbic1 = SBOX_READ(mic_ctx->mmio.va, SBOX_SDBIC1);
+ SBOX_WRITE(0x0, mic_ctx->mmio.va, SBOX_SDBIC1);
+ if (mic_ctx->sdbic1)
+ queue_delayed_work(dev->sd_ln_wq,
+ &dev->sd_watchdog_work, 0);
+ return 0;
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+static int
+ramoops_proc_show(struct seq_file *m, void *data)
+{
+ uint64_t id = ((uint64_t)data) & 0xffffffff;
+ uint64_t entry = ((uint64_t)data) >> 32;
+ struct list_head *pos, *tmpq;
+ bd_info_t *bd = NULL;
+ mic_ctx_t *mic_ctx = NULL;
+ char *record;
+ char *end;
+ int size = 0;
+ int l = 0;
+ char *output;
+ unsigned long flags;
+
+ list_for_each_safe(pos, tmpq, &mic_data.dd_bdlist) {
+ bd = list_entry(pos, bd_info_t, bi_list);
+ mic_ctx = &bd->bi_ctx;
+ if (mic_ctx->bi_id == id)
+ break;
+ }
+
+ if (mic_ctx == NULL)
+ return 0;
+
+ spin_lock_irqsave(&mic_ctx->ramoops_lock, flags);
+
+ record = mic_ctx->ramoops_va[entry];
+ if (record == NULL) {
+ spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags);
+ return -EEXIST;
+ }
+
+ size = mic_ctx->ramoops_size;
+ end = record + size;
+
+ if ((output = kzalloc(size, GFP_ATOMIC)) == NULL) {
+ spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags);
+ return -ENOMEM;
+ }
+
+ l += scnprintf(output, size, "%s", record);
+
+ spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags);
+
+ seq_printf(m, "%s", output);
+ return 0;
+}
+
/*
 * proc open: bind the seq_file single-show routine.
 * NOTE(review): proc_create_data() attaches per-entry data (the encoded
 * id/entry) to the proc entry, but NULL is passed to single_open() here,
 * so that data never reaches ramoops_proc_show() -- confirm and fix
 * together with the show routine (e.g. via PDE_DATA(inode)).
 */
static int
ramoops_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, ramoops_proc_show, NULL);
}
+
/* File operations for /proc/mic_ramoops/* entries (kernels >= 3.10),
 * backed by the seq_file single_open helpers. */
struct file_operations ramoops_proc_fops = {
	.open = ramoops_proc_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
+
+#else // LINUX VERSION
/*
 * Legacy procfs read callback (kernels < 3.10) for the ramoops entries.
 * 'data' encodes the board id in the low 32 bits and the record index
 * (0 = current, 1 = previous) in the high 32 bits.  Snapshots the record
 * under the lock, then serves the [offset, offset+len) window of it per
 * the old create_proc_read_entry() protocol.
 */
static int
ramoops_read(char *buf, char **start, off_t offset, int len, int *eof, void *data)
{
	uint64_t id = ((uint64_t)data) & 0xffffffff;
	uint64_t entry = ((uint64_t)data) >> 32;
	struct list_head *pos, *tmpq;
	bd_info_t *bd = NULL;
	mic_ctx_t *mic_ctx = NULL;
	char *record;
	char *end;
	int size = 0;
	int l = 0;
	int left_to_read;
	char *output;
	unsigned long flags;

	/* Locate the board matching the encoded id. */
	list_for_each_safe(pos, tmpq, &mic_data.dd_bdlist) {
		bd = list_entry(pos, bd_info_t, bi_list);
		mic_ctx = &bd->bi_ctx;
		if (mic_ctx->bi_id == id)
			break;
	}

	if (mic_ctx == NULL)
		return 0;

	spin_lock_irqsave(&mic_ctx->ramoops_lock, flags);

	record = mic_ctx->ramoops_va[entry];
	if (record == NULL) {
		spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags);
		*eof = 1;
		return 0;
	}

	size = mic_ctx->ramoops_size;
	end = record + size;

	/* GFP_ATOMIC: inside a spinlock with IRQs off. */
	if ((output = kzalloc(size, GFP_ATOMIC)) == NULL) {
		spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags);
		return -ENOMEM;
	}

	/* Snapshot the record; l is its effective string length. */
	l += scnprintf(output, size, "%s", record);

	spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags);

	/* Serve at most 'len' bytes from 'offset'; flag EOF when drained. */
	left_to_read = l - offset;
	if (left_to_read < 0)
		left_to_read = 0;
	if (left_to_read == 0)
		*eof = 1;

	left_to_read = min(len, left_to_read);
	memcpy(buf, output + offset, left_to_read);
	kfree(output);
	*start = buf;
	return left_to_read;
}
+#endif // LINUX VERSION
+
+int
+set_ramoops_pa(mic_ctx_t *mic_ctx)
+{
+ if (mic_ctx->ramoops_pa[0] == 0L) {
+ kfree(mic_ctx->ramoops_va[0]);
+ mic_ctx->ramoops_size = 0;
+ mic_ctx->ramoops_va[0] = NULL;
+ return 1;
+ }
+ return 0;
+}
+
int ramoops_count = 4;	/* pages per ramoops record buffer (size = count * PAGE_SIZE) */
+
/*
 * Allocate the per-card ramoops buffer, map it for card DMA and create
 * the /proc/mic_ramoops/mic%d and mic%d_prev entries.  On any failure
 * ramoops is silently disabled (ramoops_size = 0).
 */
void
ramoops_probe(mic_ctx_t *mic_ctx)
{
	char name[64];

	mic_ctx->ramoops_size = ramoops_count * PAGE_SIZE;
	if ((mic_ctx->ramoops_va[0] = kzalloc(mic_ctx->ramoops_size, GFP_KERNEL)) != NULL) {
		spin_lock_init(&mic_ctx->ramoops_lock);
		mic_ctx->ramoops_va[1] = NULL;	/* no "previous" record yet */

		mic_ctx->ramoops_pa[0] = mic_ctx_map_single(mic_ctx, mic_ctx->ramoops_va[0],
							    mic_ctx->ramoops_size);
		/* set_ramoops_pa() frees the buffer and disables ramoops if
		 * the DMA mapping failed. */
		if (set_ramoops_pa(mic_ctx))
			return;

		/* Proc entry data encodes board id (low 32 bits) and record
		 * index (high 32 bits).
		 * NOTE(review): 1L << 32 is undefined on 32-bit hosts where
		 * long is 32 bits -- confirm this driver is 64-bit only. */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
		snprintf(name, 64, "mic%d", mic_ctx->bi_id);
		proc_create_data(name, 0444, ramoops_dir, &ramoops_proc_fops,
				 (void *)(long)mic_ctx->bi_id);

		snprintf(name, 64, "mic%d_prev", mic_ctx->bi_id);
		proc_create_data(name, 0444, ramoops_dir, &ramoops_proc_fops,
				 (void *)((long)mic_ctx->bi_id | (1L << 32)));
#else // LINUX VERSION
		snprintf(name, 64, "mic%d", mic_ctx->bi_id);
		if (create_proc_read_entry(name, 0444, ramoops_dir, ramoops_read,
					   (void *)(long)mic_ctx->bi_id) == NULL)
			printk("Failed to intialize /proc/mic_ramoops/%s\n", name);

		snprintf(name, 64, "mic%d_prev", mic_ctx->bi_id);
		if (create_proc_read_entry(name, 0444, ramoops_dir, ramoops_read,
					   (void *)((long)mic_ctx->bi_id | (1L << 32))) == NULL)
			printk("Failed to intialize /proc/mic_ramoops/%s\n", name);
#endif //LINUX VERSION
	} else {
		/* Allocation failed: disable ramoops for this card. */
		mic_ctx->ramoops_size = 0;
	}
}
+
/*
 * Rotate the ramoops buffers before a card stop/reset: the current
 * record becomes the "previous" one (index 1) and a fresh buffer is
 * allocated for index 0.  The old "previous" buffer is unmapped and
 * freed.  Runs under the ramoops spinlock, hence GFP_ATOMIC.
 */
void
ramoops_flip(mic_ctx_t *mic_ctx)
{
	unsigned long flags;

	/* ramoops disabled (allocation or mapping failed earlier). */
	if (mic_ctx->ramoops_size == 0)
		return;

	spin_lock_irqsave(&mic_ctx->ramoops_lock, flags);
	/* Drop the old "previous" record, if any. */
	if (mic_ctx->ramoops_va[1] != NULL) {
		mic_ctx_unmap_single(mic_ctx, mic_ctx->ramoops_pa[1], mic_ctx->ramoops_size);
		kfree(mic_ctx->ramoops_va[1]);
	}

	/* Current becomes previous; allocate and map a new current. */
	mic_ctx->ramoops_pa[1] = mic_ctx->ramoops_pa[0];
	mic_ctx->ramoops_va[1] = mic_ctx->ramoops_va[0];
	if ((mic_ctx->ramoops_va[0] = kzalloc(mic_ctx->ramoops_size, GFP_ATOMIC)) != NULL) {
		mic_ctx->ramoops_pa[0] = mic_ctx_map_single(mic_ctx, mic_ctx->ramoops_va[0],
							    mic_ctx->ramoops_size);
		set_ramoops_pa(mic_ctx);
	}
	spin_unlock_irqrestore(&mic_ctx->ramoops_lock, flags);
}
+
+int
+adapter_probe(mic_ctx_t *mic_ctx)
+{
+ int db;
+ uint32_t scratch13;
+ int32_t status = 0;
+
+ // Init the irq information
+ atomic_set(&mic_ctx->bi_irq.mi_received, 0);
+ spin_lock_init(&mic_ctx->bi_irq.mi_lock);
+ tasklet_init(&mic_ctx->bi_dpc, adapter_dpc, (unsigned long)&mic_ctx->bi_dpc);
+
+ for (db = 0; db < MIC_NUM_DB; db++) {
+ INIT_LIST_HEAD(&mic_ctx->bi_irq.mi_dblist[db]);
+ }
+
+ if (mic_ctx->msie)
+ mic_enable_msi_interrupts(mic_ctx);
+
+ scratch13 = SBOX_READ(mic_ctx->mmio.va, SBOX_SCRATCH13);
+ mic_ctx->bi_stepping = SCRATCH13_STEP_ID(scratch13);
+ mic_ctx->bi_substepping = SCRATCH13_SUB_STEP(scratch13);
+#ifdef MIC_IS_EMULATION
+ mic_ctx->bi_platform = PLATFORM_EMULATOR;
+#else
+ mic_ctx->bi_platform = SCRATCH13_PLATFORM_ID(scratch13);
+#endif
+
+ mic_enable_interrupts(mic_ctx);
+ if (micveth_probe(mic_ctx))
+ printk(KERN_ERR "%s: micveth_probe failed\n", __FUNCTION__);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || defined(RHEL_RELEASE_CODE)
+ if (mic_vhost_blk_probe(mic_ctx->bd_info))
+ printk(KERN_ERR "%s: mic_vhost_blk_probe failed\n", __FUNCTION__);
+#endif
+ micscif_probe(mic_ctx);
+ if(micpm_probe(mic_ctx))
+ printk(KERN_ERR "%s: micpm_probe failed\n", __FUNCTION__);
+
+ mic_reg_irqhandler(mic_ctx, 1, "MIC SHUTDOWN DoorBell 1",
+ mic_shutdown_host_doorbell_intr_handler);
+
+ ramoops_probe(mic_ctx);
+ if (status) {
+ printk("boot_linux_uos failed \n");
+ return status;
+ }
+
+ // We should only enable DMA after uos is booted
+ //mic_dma_lib_init(mic_ctx->mmio.va+HOST_SBOX_BASE_ADDRESS);
+
+ return status;
+}
+
+int
+adapter_start_device(mic_ctx_t *mic_ctx)
+{
+ int ret;
+
+ mutex_lock(&mic_ctx->state_lock);
+ if (mic_ctx->state == MIC_READY) {
+ mic_setstate(mic_ctx, MIC_BOOT);
+ } else {
+ mutex_unlock(&mic_ctx->state_lock);
+ /* TODO: Unknown state handling? */
+ printk(KERN_ERR "%s %d state %d??\n",
+ __func__, __LINE__, mic_ctx->state);
+ ret = -EINVAL;
+ goto exit;
+ }
+ mutex_unlock(&mic_ctx->state_lock);
+ mic_ctx->mode = MODE_LINUX;
+ ret = boot_linux_uos(mic_ctx, mic_ctx->image, mic_ctx->initramfs);
+ if (ret) {
+ printk(KERN_ERR "boot_linux_uos failed %d\n", ret);
+ goto exit;
+ }
+
+ ret = adapter_post_boot_device(mic_ctx);
+ if (ret) {
+ printk(KERN_ERR "adapter post boot failed %d\n", ret);
+ goto exit;
+ }
+
+ pr_debug("adapter started successfully\n");
+exit:
+ return ret;
+}
+
/*
 * One-time initialization of an adapter: state, locks, workqueues and
 * timers; MMIO/aperture BAR mapping (including zombie-mode detection
 * when the aperture is absent); virtual console buffers; PSMI; SKU
 * identification and ETC timer compensation.
 *
 * Returns 0 on success or a negative errno; on failure the workqueues
 * and MMIO mapping created so far are torn down via the goto labels.
 */
int
adapter_init_device(mic_ctx_t *mic_ctx)
{
#ifdef USE_VCONSOLE
	struct vcons_buf *vcons_buf;
#endif
	uint32_t mmio_data_cc; /* mmio data from class code register */
	uint32_t mmio_data_bar; /* mmio data from bar enable register */
	uint32_t device_id;
	int err = 0;

	spin_lock_init(&mic_ctx->sysfs_lock);
	mic_setstate(mic_ctx, MIC_RESET);
	mic_ctx->mode = MODE_NONE;
	mic_ctx->reset_count = 0;
	mutex_init (&mic_ctx->state_lock);
	init_waitqueue_head(&mic_ctx->resetwq);
	init_waitqueue_head(&mic_ctx->ioremapwq);
	init_timer(&mic_ctx->boot_timer);
	if (!(mic_ctx->resetworkq = __mic_create_singlethread_workqueue("RESET WORK")))
		return -ENOMEM;
	if (!(mic_ctx->ioremapworkq = __mic_create_singlethread_workqueue("IOREMAP_WORK"))) {
		err = -EINVAL;
		goto destroy_reset_wq;
	}
	INIT_WORK(&mic_ctx->ioremapwork, ioremap_work);
	INIT_WORK(&mic_ctx->boot_ws, post_boot_startup);
	INIT_WORK(&mic_ctx->resetwork, attempt_reset);
	atomic_set(&mic_ctx->gate_interrupt, 0);

	/* KNF vs KNC is derived from the PCI device id. */
	device_id = mic_ctx->bi_pdev->device;
	mic_ctx->bi_family = get_product_family(device_id);

	if ((mic_ctx->mmio.va = ioremap_nocache(mic_ctx->mmio.pa,
		mic_ctx->mmio.len)) == NULL) {
		printk("mic %d: failed to map mmio space\n", mic_ctx->bi_id);
		err = -ENOMEM;
		goto destroy_remap_wq;
	}

	if (mic_ctx->aper.pa == 0) {
		/*
		 * Read class code from SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8 register
		 * If the mode is zombie, then
		 * 1> Aperture is not available
		 * 2> Register 0x5CD4 is written to 0x00000002 to disable all BARs except MMIO
		 * 3> Register 0x5808 is written to 0xFF0000XX to set the class ID to a generic PCI device.
		 */
		mmio_data_cc = SBOX_READ(mic_ctx->mmio.va, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8);
		mmio_data_cc = PCIE_CLASS_CODE(mmio_data_cc);
		mmio_data_bar = SBOX_READ(mic_ctx->mmio.va, SBOX_PCIE_BAR_ENABLE);

		if((mmio_data_cc == ZOMBIE_CLASS_CODE) && (mmio_data_bar == DISABLE_BAR)) {
			mic_ctx->card_usage_mode = USAGE_MODE_ZOMBIE;
			usagemode_param = USAGE_MODE_ZOMBIE;
		} else {
			printk("Error: Not in zombie mode and aperture is 0\n");
			err = -EINVAL;
			goto adap_init_unmapmmio;
		}
	} else {
		/* Map the aperture asynchronously when possible; callers wait
		 * on ioremapwq. */
		if (mic_ctx->ioremapworkq) {
			queue_work(mic_ctx->ioremapworkq, &mic_ctx->ioremapwork);
		} else {
			if ((mic_ctx->aper.va = ioremap_wc(mic_ctx->aper.pa, mic_ctx->aper.len)) == NULL) {
				printk("mic %d: failed to map aperture space\n", mic_ctx->bi_id);
				err = -EINVAL;
				goto adap_init_unmapmmio;
			}
		}
	}

	mic_debug_init(mic_ctx);
	mic_smpt_init(mic_ctx);
#ifdef USE_VCONSOLE
	// Allocate memory for PCI serial console
	mic_ctx->bi_vcons.dc_buf_virt = (void *)get_zeroed_page(GFP_KERNEL);
	mic_ctx->bi_vcons.dc_hdr_virt = kzalloc(sizeof(struct vcons_buf), GFP_KERNEL);

	/* vcons is optional: on any allocation/mapping failure it is simply
	 * disabled rather than failing adapter init. */
	if ((!mic_ctx->bi_vcons.dc_buf_virt) || (!mic_ctx->bi_vcons.dc_hdr_virt)) {
		printk(KERN_ERR "mic %d: failed to allocate memory for vcons buffer\n",
		       mic_ctx->bi_id);
		mic_ctx->bi_vcons.dc_enabled = 0;
		if (mic_ctx->bi_vcons.dc_buf_virt)
			free_pages((uint64_t)mic_ctx->bi_vcons.dc_buf_virt, 0);
		if (mic_ctx->bi_vcons.dc_hdr_virt)
			kfree(mic_ctx->bi_vcons.dc_hdr_virt);
	} else {
		mic_ctx->bi_vcons.dc_hdr_dma_addr = mic_ctx_map_single(mic_ctx,
								       mic_ctx->bi_vcons.dc_hdr_virt,
								       sizeof(struct vcons_buf));
		mic_ctx->bi_vcons.dc_dma_addr = mic_ctx_map_single(mic_ctx,
								   mic_ctx->bi_vcons.dc_buf_virt,
								   MICVCONS_BUF_SIZE);
		if ((!mic_ctx->bi_vcons.dc_dma_addr) ||
		    (!mic_ctx->bi_vcons.dc_hdr_dma_addr))
			mic_ctx->bi_vcons.dc_enabled = 0;
		else
			mic_ctx->bi_vcons.dc_enabled = 1;
		mic_ctx->bi_vcons.dc_size = MICVCONS_BUF_SIZE;
		vcons_buf = (struct vcons_buf *)(mic_ctx->bi_vcons.dc_hdr_virt);
		vcons_buf->o_buf_dma_addr = mic_ctx->bi_vcons.dc_dma_addr;
		vcons_buf->o_size = MICVCONS_BUF_SIZE;
		/* Publish all header fields before the magic so the card
		 * never sees a half-initialized header. */
		smp_wmb();
		vcons_buf->host_magic = MIC_HOST_VCONS_READY;
		vcons_buf->host_rb_ver = micscif_rb_get_version();
	}
#endif // USE_VCONSOLE
	mic_ctx->boot_mem = 0;
	mic_psmi_init(mic_ctx);
	mic_ctx->dma_handle = NULL;
	mic_ctx->sdbic1 = 0;
	// To avoid hazard on Windows, sku_build_table is done on DriverEntry
	sku_build_table();
	device_id = mic_ctx->bi_pdev->device;
	sku_find(mic_ctx, device_id);
	// To avoid hazard on Windows, sku_destroy_table is done on MicUnload
	sku_destroy_table();

	/* Determine the amount of compensation that needs to be applied to MIC's ETC timer */
	calculate_etc_compensation(mic_ctx);

	return 0;

adap_init_unmapmmio:
	iounmap(mic_ctx->mmio.va);
destroy_remap_wq:
	destroy_workqueue(mic_ctx->ioremapworkq);
destroy_reset_wq:
	destroy_workqueue(mic_ctx->resetworkq);
	return err;
}
+
/* Unmask the card's interrupt sources via the SBOX enable macro. */
void
mic_enable_interrupts(mic_ctx_t *mic_ctx)
{
	ENABLE_MIC_INTERRUPTS(mic_ctx->mmio.va);
}
+
/*
 * Mask every currently-enabled interrupt source: read the enable set
 * from SICE0 and write the same bits into the clear register SICC0.
 */
void
mic_disable_interrupts(mic_ctx_t *mic_ctx)
{
	uint32_t sboxSice0reg;

	sboxSice0reg = SBOX_READ(mic_ctx->mmio.va, SBOX_SICE0);
	SBOX_WRITE(sboxSice0reg, mic_ctx->mmio.va, SBOX_SICC0);
}
+
+void
+mic_enable_msi_interrupts(mic_ctx_t *mic_ctx)
+{
+ uint32_t sboxMXARreg;
+
+ // Only support single MSI interrupt for now
+ sboxMXARreg = SBOX_SICE0_DBR_BITS(0xf) | SBOX_SICE0_DMA_BITS(0xff);
+ if (mic_ctx->bi_family == FAMILY_KNC)
+ SBOX_WRITE(sboxMXARreg, mic_ctx->mmio.va, SBOX_MXAR0_K1OM);
+ else
+ SBOX_WRITE(sboxMXARreg, mic_ctx->mmio.va, SBOX_MXAR0);
+}
+
+int
+mic_reg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring,
+ int (*irqfunc)(mic_ctx_t *mic_ctx, int doorbell))
+{
+ mic_irqhandler_t *irqhandle;
+ unsigned long flags;
+
+ if (doorbell > MIC_IRQ_MAX) {
+ return EINVAL;
+ }
+
+ if (!(irqhandle = kmalloc(sizeof(mic_irqhandler_t), GFP_ATOMIC)))
+ goto memerror1;
+
+ if (!(irqhandle->ih_idstring = kmalloc(strlen(idstring) + 1, GFP_ATOMIC)))
+ goto memerror2;
+
+ irqhandle->ih_func = irqfunc;
+ strcpy(irqhandle->ih_idstring, idstring);
+
+ spin_lock_irqsave(&mic_ctx->bi_irq.mi_lock, flags);
+ list_add_tail(&irqhandle->ih_list, &mic_ctx->bi_irq.mi_dblist[doorbell]);
+ spin_unlock_irqrestore(&mic_ctx->bi_irq.mi_lock, flags);
+ return 0;
+
+memerror2:
+ kfree(irqhandle);
+memerror1:
+ return -ENOMEM;
+}
+
+int
+mic_unreg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring)
+{
+ mic_irqhandler_t *irqhandle;
+ struct list_head *pos, *tmpq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mic_ctx->bi_irq.mi_lock, flags);
+ list_for_each_safe(pos, tmpq, &mic_ctx->bi_irq.mi_dblist[doorbell]) {
+ irqhandle = list_entry(pos, mic_irqhandler_t, ih_list);
+ if (strcmp(idstring, irqhandle->ih_idstring) == 0) {
+ list_del(pos);
+ kfree(irqhandle->ih_idstring);
+ kfree(irqhandle);
+ }
+ }
+ spin_unlock_irqrestore(&mic_ctx->bi_irq.mi_lock, flags);
+
+ return 0;
+}
+
/*
 * Dispatch one acknowledged interrupt word (SICR0 snapshot): invoke the
 * registered handler list for each asserted doorbell bit, then hand DMA
 * bits to the DMA library.  Called from both the hard-IRQ path
 * (adapter_isr) and the tasklet path (adapter_dpc).
 */
static __always_inline
void adapter_process_one_interrupt(mic_ctx_t *mic_ctx, uint32_t events)
{
	mic_irqhandler_t *irqhandle;
	struct list_head *pos;
	int doorbell;

	atomic_inc(&mic_ctx->bi_irq.mi_received);

	if (SBOX_SICR0_DBR(events)) {
		/* NOTE(review): iterates 4 doorbells while the lists are
		 * sized MIC_NUM_DB -- presumably SICR0 only carries 4 DBR
		 * bits; confirm. */
		for (doorbell = 0; doorbell < 4; doorbell++) {
			if (SBOX_SICR0_DBR(events) & (0x1 << doorbell)) {
				spin_lock(&mic_ctx->bi_irq.mi_lock);
				list_for_each(pos, &mic_ctx->bi_irq.mi_dblist[doorbell]) {
					irqhandle = list_entry(pos, mic_irqhandler_t, ih_list);
					irqhandle->ih_func(mic_ctx, doorbell);
				}
				spin_unlock(&mic_ctx->bi_irq.mi_lock);
			}
		}

	}

	if (SBOX_SICR0_DMA(events))
		host_dma_interrupt_handler(mic_ctx->dma_handle, events);
}
+
/*
 * Hard-IRQ handler.  Reads and acknowledges SICR0, re-enables interrupts
 * where the hardware auto-masks them (KNC >= B0), then dispatches the
 * events.  gate_interrupt serializes against the tasklet path.
 * Returns 0 when an interrupt was handled, -1 for spurious/contended.
 */
int
adapter_isr(mic_ctx_t *mic_ctx)
{
	volatile uint32_t sboxSicr0reg;
	/* Someone else (ISR or DPC) is already in the handler. */
	if (atomic_cmpxchg(&mic_ctx->gate_interrupt, 0, 1) == 1)
		return -1;

	sboxSicr0reg = SBOX_READ(mic_ctx->mmio.va, SBOX_SICR0);

	if (unlikely(!sboxSicr0reg)) {
		// Spurious interrupt
		atomic_set(&mic_ctx->gate_interrupt, 0);
		return -1;
	}

	// tell mic that we recived interrupt otherwise it will keep sending them
	SBOX_WRITE(sboxSicr0reg, mic_ctx->mmio.va, SBOX_SICR0);

	// This only applies to KNC B0
	if (FAMILY_KNC == mic_ctx->bi_family &&
	    mic_ctx->bi_stepping >= KNC_B0_STEP)
		mic_enable_interrupts(mic_ctx);

	/* Release the gate before dispatch so new interrupts can be taken. */
	atomic_set(&mic_ctx->gate_interrupt, 0);
	adapter_process_one_interrupt(mic_ctx, sboxSicr0reg);
	return 0;
}
+
/* MSI/MSI-X top half: defer all SBOX register handling to the
 * tasklet bottom half (adapter_dpc). Always returns 0. */
int
adapter_imsr(mic_ctx_t *mic_ctx)
{
#if 0 /* TODO: disable interrupt when KNC auto-enable isn't used */
	mic_disable_interrupts(mic_ctx);
#endif
	tasklet_schedule(&mic_ctx->bi_dpc);
	return 0;
}
+
/* Tasklet bottom half for MSI/MSI-X interrupts: clears the pending
 * bit array, acknowledges the SBOX interrupt causes and dispatches
 * them — mirroring the ordering used in adapter_isr(). */
static void adapter_dpc(unsigned long dpc)
{
	mic_ctx_t *mic_ctx =
		container_of((struct tasklet_struct *)dpc, mic_ctx_t, bi_dpc);

	volatile uint32_t sboxSicr0reg;

	/* Single-entry gate shared with adapter_isr(). */
	if (atomic_cmpxchg(&mic_ctx->gate_interrupt, 0, 1) == 1)
		return;

	/* Clear pending bit array (register offset differs on KNC A0). */
	if (FAMILY_KNC == mic_ctx->bi_family) {
		if (KNC_A_STEP == mic_ctx->bi_stepping)
			SBOX_WRITE(1, mic_ctx->mmio.va, SBOX_MSIXPBACR_K1OM);
	} else
		SBOX_WRITE(1, mic_ctx->mmio.va, SBOX_MSIXPBACR);

	sboxSicr0reg = SBOX_READ(mic_ctx->mmio.va, SBOX_SICR0);
	if (unlikely(!sboxSicr0reg)) {
		/* Nothing pending: spurious wakeup. */
		atomic_set(&mic_ctx->gate_interrupt, 0);
		return;
	}

	/* Ack the causes back to the card. */
	SBOX_WRITE(sboxSicr0reg, mic_ctx->mmio.va, SBOX_SICR0);

	// This only applies to KNC B0
	if (FAMILY_KNC == mic_ctx->bi_family &&
		mic_ctx->bi_stepping >= KNC_B0_STEP)
		mic_enable_interrupts(mic_ctx);

	atomic_set(&mic_ctx->gate_interrupt, 0);
	adapter_process_one_interrupt(mic_ctx, sboxSicr0reg);
}
+
/* Create the /proc/mic_ramoops directory; per-board entries are added
 * elsewhere. NOTE(review): the result is not checked — on failure
 * ramoops_dir stays NULL and later remove_proc_entry() calls will be
 * made against a NULL parent. */
void ramoops_init(void)
{
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	ramoops_dir = proc_mkdir("mic_ramoops", NULL);
#else
	ramoops_dir = create_proc_entry("mic_ramoops", S_IFDIR | S_IRUGO, NULL);
#endif
}
+
+void ramoops_exit(void)
+{
+ remove_proc_entry("mic_ramoops", NULL);
+}
+
+void ramoops_remove(mic_ctx_t *mic_ctx)
+{
+ char name[64];
+ int i;
+
+ snprintf(name, 64, "mic%d", mic_ctx->bi_id);
+ remove_proc_entry(name, ramoops_dir);
+
+ snprintf(name, 64, "mic%d_prev", mic_ctx->bi_id);
+ remove_proc_entry(name, ramoops_dir);
+ if (mic_ctx->ramoops_size == 0)
+ return;
+
+ for (i = 0; i < 2; i++) {
+ if (mic_ctx->ramoops_va[i] != NULL) {
+ mic_ctx_unmap_single(mic_ctx, mic_ctx->ramoops_pa[i],
+ mic_ctx->ramoops_size);
+ kfree(mic_ctx->ramoops_va[i]);
+ }
+ }
+}
+
/* Create the /proc/mic_vmcore directory; per-board vmcore entries are
 * added elsewhere. NOTE(review): result unchecked — vmcore_dir may
 * stay NULL on failure (vmcore_exit() tolerates that). */
void vmcore_init(void)
{
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	vmcore_dir = proc_mkdir("mic_vmcore", NULL);
#else
	vmcore_dir = create_proc_entry("mic_vmcore", S_IFDIR | S_IRUGO, NULL);
#endif
}
+
+void vmcore_exit(void)
+{
+ if (vmcore_dir) {
+ remove_proc_entry("mic_vmcore", NULL);
+ vmcore_dir = NULL;
+ }
+}
+
+void vmcore_remove(mic_ctx_t *mic_ctx)
+{
+ char name[64];
+
+ snprintf(name, 64, "mic%d", mic_ctx->bi_id);
+ if (mic_ctx->vmcore_dir) {
+ remove_proc_entry(name, vmcore_dir);
+ mic_ctx->vmcore_dir = NULL;
+ }
+ if (mic_ctx->elfcorebuf) {
+ kfree(mic_ctx->elfcorebuf);
+ mic_ctx->elfcorebuf = NULL;
+ mic_ctx->elfcorebuf_sz = 0;
+ mic_ctx->vmcore_size = 0;
+ }
+}
+
+
/* One-time, driver-wide (not per-board) initialization: bring up the
 * DMA, SCIF, power-management, ramoops and vmcore subsystems and the
 * global board list. */
void
adapter_init(void)
{
	// Per driver init ONLY.
	mic_dma_init();
	micscif_init();
	micpm_init();
	ramoops_init();
	vmcore_init();
	INIT_LIST_HEAD(&mic_data.dd_bdlist);
}
+
+
+void show_stepping_comm(mic_ctx_t *mic_ctx,char *buf)
+{
+#define STEPINGSTRSIZE 3
+ char string[STEPINGSTRSIZE];
+ switch (mic_ctx->bi_family) {
+ case FAMILY_ABR:
+ switch (mic_ctx->bi_stepping) {
+ case 0:
+ string[0] = 'A';
+ string[1] = mic_ctx->bi_substepping + '0';
+ break;
+ case 2:
+ string[0] = 'B';
+ string[1] = '0';
+ break;
+ case 3:
+ string[0] = 'B';
+ string[1] = '1';
+ break;
+ case 4:
+ string[0] = 'C';
+ string[1] = '0';
+ break;
+ case 5:
+ string[0] = 'C';
+ string[1] = '1';
+ break;
+ case 6:
+ string[0] = 'D';
+ string[1] = '0';
+ break;
+ default:
+ string[0] = '?';
+ string[1] = '?';
+ break;
+ }
+ break;
+ case FAMILY_KNC:
+ switch (mic_ctx->bi_stepping) {
+ case KNC_A_STEP:
+ string[0] = 'A';
+ string[1] = '0';
+ break;
+ case KNC_B0_STEP:
+ string[0] = 'B';
+ string[1] = '0';
+ break;
+ case KNC_B1_STEP:
+ string[0] = 'B';
+ string[1] = '1';
+ break;
+ case KNC_C_STEP:
+ string[0] = 'C';
+ string[1] = '0';
+ break;
+ default:
+ string[0] = '?';
+ string[1] = '?';
+ break;
+ }
+ break;
+ default:
+ string[0] = '?';
+ string[1] = '?';
+ break;
+ }
+
+ string[2] = '\0';
+
+ strncpy(buf,string,STEPINGSTRSIZE);
+}
+
+
--- /dev/null
+ /*
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+
+ * (C) Badari Pulavarty pbadari@us.ibm.com 2010 with the following comment.
+ * He posted on http://lwn.net/Articles/382543/
+
+ * virtio-block server in host kernel.
 * Inspired by vhost-net and shamelessly ripped code from it :)
+
+ * For adapting to MIC
+ * (C) Copyright 2012 Intel Corporation
+ * Author: Caz Yokoyama <Caz.Yokoyama@intel.com>
+ */
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) || \
+ defined(RHEL_RELEASE_CODE)
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+
+#ifndef VIRTIO_RING_F_EVENT_IDX /* virtio_ring.h of rhel6.0 does not define */
+#define VIRTIO_RING_F_EVENT_IDX 29
+#endif
+#include "mic_common.h"
+#include "mic/micveth_dma.h"
+#include "vhost.h"
+#include "mic/mic_virtio.h"
+
+#define SECTOR_SHIFT 9
+#define SECTOR_SIZE (1UL << SECTOR_SHIFT)
+#define VIRTIO_BLK_QUEUE_SIZE 128
+#define DISK_SEG_MAX (VIRTIO_BLK_QUEUE_SIZE - 2)
+
+#define VHOST_BLK_VQ_MAX 1
+#define WQNAME_SIZE 16
+
/* Per-board vhost-blk backend state. */
struct vhost_blk {
	struct vhost_dev dev;				/* generic vhost device */
	struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];	/* request queue(s); only vqs[0] is used here */
	struct vhost_poll poll[VHOST_BLK_VQ_MAX];	/* unused (see #if 0 vhost_poll_init in probe) */
	struct workqueue_struct *vb_wq;			/* doorbell bottom-half workqueue */
	char vb_wqname[WQNAME_SIZE];			/* name backing vb_wq */
	struct work_struct vb_ws_bh;			/* doorbell bottom-half work (handle_blk_kick) */
	struct workqueue_struct *vblk_workqueue;	/* I/O submission workqueue (handle_io_work) */
	struct board_info *bd_info;			/* owning MIC board */
	char *file_name;				/* backing file path (kmalloc'd) */
	struct file *virtblk_file;			/* opened backing file */
};
+
+struct vhost_blk_io {
+ struct list_head list;
+ struct work_struct work;
+ struct vhost_blk *blk;
+ struct file *file;
+ int head;
+ uint32_t type;
+ uint32_t nvecs;
+ uint64_t sector;
+ uint64_t len;
+ struct iovec iov[0];
+};
+
+#define mic_addr_in_host(va, pa) ((u8 *)(va) + (u64)(pa))
+
+static LIST_HEAD(write_queue);
+static LIST_HEAD(read_queue);
+
+static void
+cleanup_vblk_workqueue(struct vhost_blk_io *vbio, struct vhost_virtqueue *vq)
+{
+ struct list_head single, *head, *node, *tmp;
+ int need_free;
+ struct vhost_blk_io *entry;
+
+ if (vbio->head != -1) {
+ INIT_LIST_HEAD(&single);
+ list_add(&vbio->list, &single);
+ head = &single;
+ need_free = 0;
+ } else {
+ head = &vbio->list;
+ need_free = 1;
+ }
+
+ mutex_lock(&vq->mutex);
+ list_for_each_safe(node, tmp, head) {
+ entry = list_entry(node, struct vhost_blk_io, list);
+ list_del(node);
+ kfree(entry);
+ }
+ mutex_unlock(&vq->mutex);
+
+ if (need_free)
+ kfree(vbio);
+}
+
/* Worker executing one (possibly merged) block request: performs the
 * flush/read/write against the backing file, stores the virtio status
 * byte for each original request, completes them on the virtqueue and
 * frees the request structures. */
static void handle_io_work(struct work_struct *work)
{
	struct vhost_blk_io *vbio, *entry;
	struct vhost_virtqueue *vq;
	struct vhost_blk *blk;
	struct list_head single, *head, *node, *tmp;
	struct iovec *iov;
	uint8_t *aper_va;
	struct vring *vring;
	unsigned int num;

	int need_free, ret = 0;
	loff_t pos;
	uint8_t status = 0;

	vbio = container_of(work, struct vhost_blk_io, work);
	blk = vbio->blk;
	vq = &blk->dev.vqs[0];
	pos = vbio->sector << SECTOR_SHIFT;
	aper_va = blk->bd_info->bi_ctx.aper.va;

	/* If the card-side ring is gone (num == 0) or a PM reference
	 * cannot be taken, just free the request(s) without I/O. */
	vring = &((struct mic_virtblk *)blk->bd_info->bi_virtio)->vb_shared.vring;
	num = readl(&vring->num);
	if (num == 0 || micpm_get_reference(&blk->bd_info->bi_ctx, true)) {
		cleanup_vblk_workqueue(vbio, vq);
		return;
	}

	if (atomic64_read(&vbio->file->f_count) == 0) { /* file is closed */
		ret = -1;
	} else if (vbio->type & VIRTIO_BLK_T_FLUSH) {
		/* vfs_fsync signature differs across kernel versions. */
#ifdef RHEL_RELEASE_CODE
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
		ret = vfs_fsync(vbio->file, 1);
#else
		ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
#endif
#else
		ret = vfs_fsync(vbio->file, 1);
#endif
	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
		/* Guest buffers are card-relative offsets; rebase each
		 * iovec into the host mapping of the aperture. */
		for (iov = vbio->iov; iov < &vbio->iov[vbio->nvecs]; iov++) {
			iov->iov_base = mic_addr_in_host(aper_va, iov->iov_base);
		}
		ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
	} else {
		for (iov = vbio->iov; iov < &vbio->iov[vbio->nvecs]; iov++) {
			iov->iov_base = mic_addr_in_host(aper_va, iov->iov_base);
		}
		ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
	}
	status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
	/* head != -1: single request — chain it on a local list so both
	 * cases walk a uniform list of original requests below.
	 * head == -1: merge container — originals hang off ->list. */
	if (vbio->head != -1) {
		INIT_LIST_HEAD(&single);
		list_add(&vbio->list, &single);
		head = &single;
		need_free = 0;
	} else {
		head = &vbio->list;
		need_free = 1;
	}
	/* Store the status byte into each request's trailing iovec. */
	list_for_each_entry(entry, head, list) {
		memcpy_toio(mic_addr_in_host(aper_va, entry->iov[entry->nvecs].iov_base), &status, sizeof(status));
	}
	mutex_lock(&vq->mutex);
	/* Complete and free each original request. */
	list_for_each_safe(node, tmp, head) {
		entry = list_entry(node, struct vhost_blk_io, list);
		vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret);
		list_del(node);
		kfree(entry);
	}
	mutex_unlock(&vq->mutex);
	if (need_free)
		kfree(vbio);
	micpm_put_reference(&blk->bd_info->bi_ctx);
}
+
+static struct vhost_blk_io *allocate_vbio(int nvecs)
+{
+ struct vhost_blk_io *vbio;
+ int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec);
+ vbio = kmalloc(size, GFP_KERNEL);
+ if (vbio) {
+ INIT_WORK(&vbio->work, handle_io_work);
+ INIT_LIST_HEAD(&vbio->list);
+ }
+ return vbio;
+}
+
+static void merge_and_handoff_work(struct list_head *queue)
+{
+ struct vhost_blk_io *vbio, *entry;
+ int nvecs = 0;
+ int entries = 0;
+
+ list_for_each_entry(entry, queue, list) {
+ nvecs += entry->nvecs;
+ entries++;
+ }
+
+ if (entries == 1) {
+ vbio = list_first_entry(queue, struct vhost_blk_io, list);
+ list_del(&vbio->list);
+ queue_work(vbio->blk->vblk_workqueue, &vbio->work);
+ return;
+ }
+
+ vbio = allocate_vbio(nvecs);
+ if (!vbio) {
+ /* Unable to allocate memory - submit IOs individually */
+ list_for_each_entry(vbio, queue, list) {
+ queue_work(vbio->blk->vblk_workqueue, &vbio->work);
+ }
+ INIT_LIST_HEAD(queue);
+ return;
+ }
+
+ entry = list_first_entry(queue, struct vhost_blk_io, list);
+ vbio->nvecs = nvecs;
+ vbio->blk = entry->blk;
+ vbio->file = entry->file;
+ vbio->type = entry->type;
+ vbio->sector = entry->sector;
+ vbio->head = -1;
+ vbio->len = 0;
+ nvecs = 0;
+
+ list_for_each_entry(entry, queue, list) {
+ memcpy(&vbio->iov[nvecs], entry->iov, entry->nvecs * sizeof(struct iovec));
+ nvecs += entry->nvecs;
+ vbio->len += entry->len;
+ }
+ list_replace_init(queue, &vbio->list);
+ queue_work(vbio->blk->vblk_workqueue, &vbio->work);
+}
+
/* Submit a sector-sorted queue: runs of sector-contiguous requests are
 * cut off and merged by merge_and_handoff_work(); a break in
 * contiguity flushes the run accumulated so far. */
static void start_io(struct list_head *queue)
{
	struct list_head start;
	struct vhost_blk_io *vbio = NULL, *entry;

	if (list_empty(queue))
		return;

	list_for_each_entry(entry, queue, list) {
		if (!vbio) {
			vbio = entry;
			continue;
		}
		/* 'entry' extends the contiguous run ending at 'vbio'? */
		if (vbio->sector + (vbio->len >> SECTOR_SHIFT) == entry->sector) {
			vbio = entry;
		} else {
			/* Contiguity broken: cut everything up to and
			 * including 'vbio' off the queue and submit it. */
			INIT_LIST_HEAD(&start);
			list_cut_position(&start, queue, &vbio->list);
			merge_and_handoff_work(&start);
			vbio = entry;
		}
	}
	/* Submit the final run. */
	if (!list_empty(queue))
		merge_and_handoff_work(queue);
}
+
/* Sum the byte lengths of 'nvecs' iovecs. */
static uint64_t calculate_len(struct iovec *iov, int nvecs)
{
	uint64_t total = 0;
	struct iovec *end = iov + nvecs;

	while (iov < end)
		total += (iov++)->iov_len;
	return total;
}
+
/* Insert 'vbio' into 'queue' keeping it sorted by starting sector
 * (ascending). */
static void insert_to_queue(struct vhost_blk_io *vbio,
			struct list_head *queue)
{
	struct vhost_blk_io *entry;

	/* Find the first entry with a larger sector. If none matches,
	 * the loop terminates with &entry->list == queue, so the insert
	 * below appends at the tail — a standard list.h idiom. */
	list_for_each_entry(entry, queue, list) {
		if (entry->sector > vbio->sector)
			break;
	}
	/* list_add_tail inserts vbio immediately before 'entry'. */
	list_add_tail(&vbio->list, &entry->list);
}
+
/* Build a vhost_blk_io for one guest request and route it: flushes go
 * straight to the I/O workqueue; reads and writes are inserted
 * sector-sorted into the global read/write queues for later merging.
 * Returns 0 on success or -ENOMEM. */
static int handoff_io(struct vhost_blk *blk, int head,
		uint32_t type, uint64_t sector,
		struct iovec *iov, int nvecs)
{
	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
	struct vhost_blk_io *vbio;

	/* +1: keep the trailing status-byte iovec with the data iovecs. */
	vbio = allocate_vbio(nvecs+1);
	if (!vbio) {
		return -ENOMEM;
	}
	vbio->blk = blk;
	vbio->head = head;
	vbio->file = vq->private_data;
	vbio->type = type;
	vbio->sector = sector;
	vbio->nvecs = nvecs;
	vbio->len = calculate_len(iov, nvecs);
	memcpy(vbio->iov, iov, (nvecs + 1) * sizeof(struct iovec));

	if (vbio->type & VIRTIO_BLK_T_FLUSH) {
#if 0
		/* Sync called - do I need to submit IOs in the queue ? */
		start_io(&read_queue);
		start_io(&write_queue);
#endif
		queue_work(blk->vblk_workqueue, &vbio->work);
	} else if (vbio->type & VIRTIO_BLK_T_OUT) {
		insert_to_queue(vbio, &write_queue);
	} else {
		insert_to_queue(vbio, &read_queue);
	}
	return 0;
}
+
/* Drain the block virtqueue: (re)latch the card-published vring
 * pointers when the card signals an update, then pull available
 * descriptors and hand each request to handoff_io() until the ring is
 * empty, finally submitting the accumulated read/write queues. */
static void handle_blk(struct vhost_blk *blk)
{
	struct vhost_virtqueue *vq = &blk->dev.vqs[0];
	unsigned head, out, in;
	struct virtio_blk_outhdr hdr;
	int nvecs;
	struct board_info *bd_info = blk->bd_info;
	struct vring *vring;

	vring = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.vring;
	if (vring == 0 || readl(&vring->num) == 0) {
		printk("request comes in while card side driver is not loaded yet. Ignore\n");
		return;
	}
	/* the first time since the card side driver becomes ready */
	if (vq->desc == NULL || readb(&((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.update)) {
		vq->num = readl(&vring->num);
		vq->desc = (struct vring_desc *)readq(&vring->desc);
		vq->avail = (struct vring_avail *)readq(&vring->avail);
		vq->used = (struct vring_used *)readq(&vring->used);
		vq->last_avail_idx = 0;
		vq->avail_idx = 0;
		vq->last_used_idx = 0;
		vq->signalled_used = 0;
		vq->signalled_used_valid = false;
		vq->done_idx = 0;
		writeb(false, &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.update);
	}

	if (micpm_get_reference(&blk->bd_info->bi_ctx, true))
		return;

	mutex_lock(&vq->mutex);

	vhost_disable_notify(&blk->dev, vq);

	for (;;) {
		head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		/* Ring empty (head == vq->num) or error: re-enable
		 * notification and re-check once in case requests raced
		 * in, then submit the queued reads and writes. */
		if ((head == vq->num) || (head == -EFAULT) || (head == -EINVAL)) {
			if (unlikely(vhost_enable_notify(&blk->dev, vq))) {
				vhost_disable_notify(&blk->dev, vq);
				continue;
			}
			start_io(&read_queue);
			start_io(&write_queue);
			break;
		}

		/* Descriptor 0 is always the 16-byte request header. */
		BUG_ON(vq->iov[0].iov_len != 16);

		memcpy_fromio(&hdr, mic_addr_in_host(bd_info->bi_ctx.aper.va, vq->iov[0].iov_base),
			      sizeof(hdr));

		/* Data iovecs follow the header; reads take theirs from
		 * the in-direction count instead. */
		nvecs = out - 1;
		if (hdr.type == VIRTIO_BLK_T_IN)
			nvecs = in - 1;

		/* The last iovec is the 1-byte status the guest expects. */
		BUG_ON(vq->iov[nvecs+1].iov_len != 1);
		if (handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs) < 0) {
			vhost_discard_vq_desc(vq, 1);
			continue;
		}
	}
	mutex_unlock(&vq->mutex);
	micpm_put_reference(&blk->bd_info->bi_ctx);
}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+ struct vhost_blk *vblk;
+
+ vblk = container_of(work, struct vhost_blk, vb_ws_bh);
+ handle_blk(vblk);
+}
+
+#if 0
+static void handle_rq_blk(struct vhost_work *work)
+{
+ struct vhost_blk *blk;
+
+ blk = container_of(work, struct vhost_blk, poll[0].work);
+ handle_blk(blk);
+}
+#endif
+
+static int
+vhost_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell)
+{
+ struct board_info *bi;
+ struct vhost_blk *vblk;
+
+ bi = container_of(mic_ctx, struct board_info, bi_ctx);
+ vblk = ((struct mic_virtblk *)bi->bi_virtio)->vblk;
+ queue_work(vblk->vb_wq, &vblk->vb_ws_bh);
+
+ return 0;
+}
+
/* Attach the already-opened backing file to the block vq, create the
 * doorbell workqueue, publish the virtio-blk configuration (features,
 * seg_max, block size, capacity) to the card, and register the
 * doorbell interrupt handler. Returns 0 or a negative errno. */
static long vhost_blk_set_backend(struct vhost_blk *vblk)
{
	struct vhost_virtqueue *vq;
	struct board_info *bd_info = vblk->bd_info;
	unsigned index = bd_info->bi_ctx.bi_id;
	struct vb_shared *vb_shared;
	int ret = 0;
	struct kstat stat;
	unsigned int virtio_blk_features = (1U << VIRTIO_BLK_F_SEG_MAX) |
		(1U << VIRTIO_BLK_F_BLK_SIZE);

	if (index >= MAX_BOARD_SUPPORTED) {
		ret = -ENOBUFS;
		goto _exit_;
	}
	if (vblk->virtblk_file == NULL) {
		ret = -EBADF;
		goto _exit_;
	}

	vq = &vblk->vqs[0];
	mutex_lock(&vq->mutex);
	rcu_assign_pointer(vq->private_data, vblk->virtblk_file);
	mutex_unlock(&vq->mutex);

	/* Dedicated single-threaded workqueue for doorbell bottom halves.
	 * NOTE(review): vb_wq is not destroyed on the error paths below,
	 * so it leaks if vfs_getattr or mic_reg_irqhandler fails —
	 * verify against mic_vhost_blk_remove() before changing. */
	snprintf(vblk->vb_wqname, sizeof(vblk->vb_wqname),
		 "virtblk wq %d", index);
	vblk->vb_wq = __mic_create_singlethread_workqueue(vblk->vb_wqname);
	if (vblk->vb_wq == NULL) {
		ret = -ENOMEM;
		goto _exit_;
	}
	INIT_WORK(&vblk->vb_ws_bh, handle_blk_kick);

	/* They have to be accessed from "struct vhost_virtqueue *vq" in mic_vhost.c.
	   They are not used in vhost block. I don't modify vhost.h. */
	vq->log_base = (void __user *)&bd_info->bi_ctx;
	vq->log_addr = (u64)bd_info->bi_ctx.aper.va;

	vb_shared = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared;
#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0))
	virtio_blk_features |= (1U << VIRTIO_BLK_F_FLUSH);
#endif
	writel(virtio_blk_features, &vb_shared->host_features);
	writel(DISK_SEG_MAX, &vb_shared->blk_config.seg_max);
	writel(SECTOR_SIZE, &vb_shared->blk_config.blk_size);

	/* vfs_getattr signature differs across kernel versions. */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0))
	ret = vfs_getattr(&vblk->virtblk_file->f_path, &stat);
#else
	ret = vfs_getattr(vblk->virtblk_file->f_path.mnt,
			  vblk->virtblk_file->f_path.dentry, &stat);
#endif
	if (ret < 0)
		goto _exit_;

	/* Publish the capacity in 512-byte sectors: device size for a
	 * block device, file size otherwise. */
	if (S_ISBLK(stat.mode)) {
		writel(i_size_read(I_BDEV(vblk->virtblk_file->f_mapping->host)->bd_inode) / SECTOR_SIZE,
		       &vb_shared->blk_config.capacity);
	} else {
		writel(stat.size / SECTOR_SIZE, &vb_shared->blk_config.capacity);
	}

	ret = mic_reg_irqhandler(&bd_info->bi_ctx, MIC_IRQ_DB2, "Host DoorBell 2",
				 vhost_doorbell_intr_handler);

_exit_:
	return ret;
}
+
/* Stop accepting block requests from the card: zeroing vring.num makes
 * handle_blk() and handle_io_work() reject further work. */
void
mic_vhost_blk_stop(bd_info_t *bd_info)
{
	struct vring *vring = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared.vring;

	writel(0, &vring->num); /* reject subsequent request from MIC card */
}
+
+extern bd_info_t *dev_to_bdi(struct device *dev);
+
+ssize_t
+show_virtblk_file(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct board_info *bd_info = dev_to_bdi(dev);
+ struct mic_virtblk *mic_virtblk;
+ struct vhost_blk *vblk;
+
+ BUG_ON(bd_info == NULL);
+ mic_virtblk = bd_info->bi_virtio;
+ BUG_ON(mic_virtblk == NULL);
+ vblk = mic_virtblk->vblk;
+ BUG_ON(vblk == NULL);
+
+ if (vblk->file_name != NULL)
+ return snprintf(buf, PAGE_SIZE, "%s\n", vblk->file_name);
+ else
+ return 0;
+}
+
+ssize_t
+store_virtblk_file(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret = 0;
+ struct board_info *bd_info = dev_to_bdi(dev);
+ struct mic_virtblk *mic_virtblk;
+ struct vhost_blk *vblk;
+ struct vhost_virtqueue *vq;
+ char *p;
+ struct file *virtblk_file;
+
+ BUG_ON(bd_info == NULL);
+ mic_virtblk = bd_info->bi_virtio;
+ BUG_ON(mic_virtblk == NULL);
+ vblk = mic_virtblk->vblk;
+ BUG_ON(vblk == NULL);
+ vq = &vblk->vqs[0];
+ BUG_ON(vq == NULL);
+
+ if (buf == NULL) {
+ ret = -EINVAL;
+ goto _return_;
+ }
+ if (count <= 1) {
+ ret = -EINVAL;
+ goto _return_;
+ }
+
+ p = strchr(buf, '\n');
+ if (p != NULL)
+ *p = '\0';
+
+ mutex_lock(&vq->mutex);
+ if (vblk->virtblk_file != NULL) { /* if virtblk file is already assigned */
+ printk(KERN_ALERT "you are changing virtblk file: %s -> %s.\n", vblk->file_name, buf);
+ kfree(vblk->file_name);
+ vblk->file_name = NULL;
+ filp_close(vblk->virtblk_file, current->files);
+ vblk->virtblk_file = NULL;
+ }
+
+ vblk->file_name = kmalloc(count + 1, GFP_KERNEL);
+ strcpy(vblk->file_name, buf);
+ virtblk_file = filp_open(vblk->file_name, O_RDWR|O_LARGEFILE, 0);
+ if (IS_ERR(virtblk_file)) {
+ ret = PTR_ERR(virtblk_file);
+ mutex_unlock(&vq->mutex);
+ goto free_file_name;
+ }
+ vblk->virtblk_file = virtblk_file;
+ mutex_unlock(&vq->mutex);
+
+ ret = vhost_blk_set_backend(vblk);
+ if (ret < 0)
+ goto close_virtblk_file;
+
+ return count;
+
+ close_virtblk_file:
+ filp_close(vblk->virtblk_file, current->files);
+ free_file_name:
+ kfree(vblk->file_name);
+ _return_:
+ return ret;
+}
+
+int mic_vhost_blk_probe(bd_info_t *bd_info)
+{
+ int ret = 0;
+ char wq_name[8];
+ struct mic_virtblk *mic_virtblk;
+ struct vhost_blk *vblk;
+
+ mic_virtblk = kzalloc(sizeof(*mic_virtblk), GFP_KERNEL);
+ if (mic_virtblk == NULL) {
+ ret = -ENOMEM;
+ goto err_vblk;
+ }
+ bd_info->bi_virtio = mic_virtblk;
+
+ vblk = kzalloc(sizeof *vblk, GFP_KERNEL);
+ if (vblk == NULL) {
+ ret = -ENOMEM;
+ goto free_mic_virtblk;
+ }
+ mic_virtblk->vblk = vblk;
+ vblk->bd_info = bd_info;
+
+ ret = vhost_dev_init(&vblk->dev, vblk->vqs, VHOST_BLK_VQ_MAX);
+ if (ret < 0)
+ goto free_vblk;
+
+#if 0
+ vhost_poll_init(vblk->poll, handle_rq_blk, POLLOUT|POLLIN, &vblk->dev);
+#endif
+
+ BUG_ON(bd_info->bi_ctx.bi_id >= 1000);
+ snprintf(wq_name, ARRAY_SIZE(wq_name), "vblk%03d", bd_info->bi_ctx.bi_id);
+ vblk->vblk_workqueue = __mic_create_singlethread_workqueue(wq_name);
+ if (vblk->vblk_workqueue == NULL) {
+ ret = -ENOMEM;
+ goto free_vblk;
+ }
+
+ return ret;
+
+ free_vblk:
+ kfree(vblk);
+ free_mic_virtblk:
+ kfree(mic_virtblk);
+ err_vblk:
+ return ret;
+}
+
/* Per-board vhost-blk teardown: if a backend was configured, unhook
 * the doorbell handler, clear the shared config, drop the backing file
 * references, then release the vhost device and all allocations. */
void mic_vhost_blk_remove(bd_info_t *bd_info)
{
	struct mic_virtblk *mic_virtblk = bd_info->bi_virtio;
	struct vhost_blk *vblk = mic_virtblk->vblk;
	struct vb_shared *vb_shared = &mic_virtblk->vb_shared;

	if (vblk->virtblk_file != NULL) { /* backend was configured */
		mic_unreg_irqhandler(&bd_info->bi_ctx, MIC_IRQ_DB2, "Host DoorBell 2");
		memset(&vb_shared->blk_config, 0, sizeof(vb_shared->blk_config));
		destroy_workqueue(vblk->vb_wq);
		/* private_data holds the reference taken in set_backend. */
		if (vblk->vqs[0].private_data != NULL)
			fput(vblk->vqs[0].private_data);
		kfree(vblk->file_name);
		filp_close(vblk->virtblk_file, current->files);
	}
	vhost_dev_cleanup(&vblk->dev);
	destroy_workqueue(vblk->vblk_workqueue);
	kfree(vblk);
	kfree(mic_virtblk);
}
+#endif
--- /dev/null
+/* Copyright (C) 2009 Red Hat, Inc.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * (C) Badari Pulavarty pbadari@us.ibm.com 2010 with the following comment.
+ * Inspiration, some code, and most witty comments come from
+ * Documentation/lguest/lguest.c, by Rusty Russell
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+
+ * For adapting to MIC
+ * (C) Copyright 2012 Intel Corporation
+ * Author: Caz Yokoyama <Caz.Yokoyama@intel.com>
+ *
+ * Generic code for virtio server in host kernel.
+ */
+
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || \
+ defined(RHEL_RELEASE_CODE)
+
+#include <linux/eventfd.h>
+#ifdef RHEL_RELEASE_CODE
+#include <linux/vhost.h>
+#else
+#include "./linux/vhost.h"
+#endif
+#include <linux/virtio_net.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/cgroup.h>
+
+#include <linux/net.h>
+#include <linux/if_packet.h>
+#include <linux/if_arp.h>
+
+#include <net/sock.h>
+
+#ifndef VIRTIO_RING_F_EVENT_IDX /* virtio_ring.h of rhel6.0 does not define */
+#define VIRTIO_RING_F_EVENT_IDX 29
+#endif
+#include "vhost.h"
+#include "mic/micveth_dma.h"
+
+#define mic_addr_in_host(va, pa) ((u8 *)(va) + (u64)(pa))
+
/* Constants for vhost memory-table handling. */
enum {
	VHOST_MEMORY_MAX_NREGIONS = 64, /* cap on tracked memory regions */
	VHOST_MEMORY_F_LOG = 0x1, /* region flag: dirty logging (presumably; users not in this chunk) */
};
+
+#if 0
+static unsigned vhost_zcopy_mask __read_mostly;
+#endif
+
+static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+ poll_table *pt)
+{
+ struct vhost_poll *poll;
+ poll = container_of(pt, struct vhost_poll, table);
+
+ poll->wqh = wqh;
+ add_wait_queue(wqh, &poll->wait);
+}
+
+static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
+ if (!((unsigned long)key & poll->mask))
+ return 0;
+
+ vhost_poll_queue(poll);
+ return 0;
+}
+
+static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
+{
+ INIT_LIST_HEAD(&work->node);
+ work->fn = fn;
+ init_waitqueue_head(&work->done);
+ work->flushing = 0;
+ work->queue_seq = work->done_seq = 0;
+}
+
/* Init poll structure: hook our wakeup callback into the wait entry
 * and our queueing callback into the poll table, record the event mask
 * and owning device, and bind the deferred work to 'fn'. */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
		     unsigned long mask, struct vhost_dev *dev)
{
	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
	init_poll_funcptr(&poll->table, vhost_poll_func);
	poll->mask = mask;
	poll->dev = dev;

	vhost_work_init(&poll->work, fn);
}
+
+#if 0
+/* Start polling a file. We add ourselves to file's wait queue. The caller must
+ * keep a reference to a file until after vhost_poll_stop is called. */
+void vhost_poll_start(struct vhost_poll *poll, struct file *file)
+{
+ unsigned long mask;
+ mask = file->f_op->poll(file, &poll->table);
+ if (mask)
+ vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
+}
+#endif
+
/* Stop polling a file. After this function returns, it becomes safe to drop the
 * file reference. You must also flush afterwards. */
void vhost_poll_stop(struct vhost_poll *poll)
{
	remove_wait_queue(poll->wqh, &poll->wait);
}
+
+static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
+ unsigned seq)
+{
+ int left;
+ spin_lock_irq(&dev->work_lock);
+ left = seq - work->done_seq;
+ spin_unlock_irq(&dev->work_lock);
+ return left <= 0;
+}
+
/* Wait until every queuing of 'work' that happened before this call
 * has completed: snapshot queue_seq under the lock, raise 'flushing'
 * (presumably consulted by the worker thread to issue wakeups on
 * work->done — the worker loop is not in this chunk), then sleep until
 * done_seq catches up. */
static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
{
	unsigned seq;
	int flushing;

	spin_lock_irq(&dev->work_lock);
	seq = work->queue_seq;
	work->flushing++;
	spin_unlock_irq(&dev->work_lock);
	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
	spin_lock_irq(&dev->work_lock);
	flushing = --work->flushing;
	spin_unlock_irq(&dev->work_lock);
	BUG_ON(flushing < 0);
}
+
/* Flush any work that has been scheduled. When calling this, don't hold any
 * locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
	vhost_work_flush(poll->dev, &poll->work);
}
+
/* Queue 'work' on the device worker thread unless it is already
 * queued (non-empty node); bump queue_seq so flushers can distinguish
 * this queuing from earlier ones. */
static inline void vhost_work_queue(struct vhost_dev *dev,
				    struct vhost_work *work)
{
	unsigned long flags;

	spin_lock_irqsave(&dev->work_lock, flags);
	if (list_empty(&work->node)) {
		list_add_tail(&work->node, &dev->work_list);
		work->queue_seq++;
		wake_up_process(dev->worker);
	}
	spin_unlock_irqrestore(&dev->work_lock, flags);
}
+
/* Schedule this poll's deferred work on its device's worker. */
void vhost_poll_queue(struct vhost_poll *poll)
{
	vhost_work_queue(poll->dev, &poll->work);
}
+
/* Return a virtqueue to its pristine state: no ring mapped, no
 * eventfds or backend attached, all indices zeroed. The scratch iovec
 * arrays (indirect/log/heads/ubuf_info) are NOT touched here — see
 * vhost_vq_free_iovecs(). */
static void vhost_vq_reset(struct vhost_dev *dev,
			   struct vhost_virtqueue *vq)
{
	vq->num = 1;
	vq->desc = NULL;
	vq->avail = NULL;
	vq->used = NULL;
	vq->last_avail_idx = 0;
	vq->avail_idx = 0;
	vq->last_used_idx = 0;
	vq->signalled_used = 0;
	vq->signalled_used_valid = false;
	vq->used_flags = 0;
	vq->log_used = false;
	vq->log_addr = -1ull;
	vq->vhost_hlen = 0;
	vq->sock_hlen = 0;
	vq->private_data = NULL;
	vq->log_base = NULL;
	vq->error_ctx = NULL;
	vq->error = NULL;
	vq->kick = NULL;
	vq->call_ctx = NULL;
	vq->call = NULL;
	vq->log_ctx = NULL;
	vq->upend_idx = 0;
	vq->done_idx = 0;
	vq->ubufs = NULL;
}
+
+static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
+{
+ kfree(vq->indirect);
+ vq->indirect = NULL;
+ kfree(vq->log);
+ vq->log = NULL;
+ kfree(vq->heads);
+ vq->heads = NULL;
+ kfree(vq->ubuf_info);
+ vq->ubuf_info = NULL;
+}
+
+#if 0
+void vhost_enable_zcopy(int vq)
+{
+ vhost_zcopy_mask |= 0x1 << vq;
+}
+#endif
+
+static void vhost_dev_free_iovecs(struct vhost_dev *dev)
+{
+ int i;
+ for (i = 0; i < dev->nvqs; ++i)
+ vhost_vq_free_iovecs(&dev->vqs[i]);
+}
+
/* Initialize a vhost device over a caller-allocated virtqueue array.
 * Scratch iovec arrays start out NULL (allocated elsewhere); poll
 * structures are armed only for vqs that have a kick handler.
 * Always returns 0. */
long vhost_dev_init(struct vhost_dev *dev,
		    struct vhost_virtqueue *vqs, int nvqs)
{
	int i;

	dev->vqs = vqs;
	dev->nvqs = nvqs;
	mutex_init(&dev->mutex);
	dev->log_ctx = NULL;
	dev->log_file = NULL;
	dev->memory = NULL;
	dev->mm = NULL;
	spin_lock_init(&dev->work_lock);
	INIT_LIST_HEAD(&dev->work_list);
	dev->worker = NULL;

	for (i = 0; i < dev->nvqs; ++i) {
		dev->vqs[i].log = NULL;
		dev->vqs[i].indirect = NULL;
		dev->vqs[i].heads = NULL;
		dev->vqs[i].ubuf_info = NULL;
		dev->vqs[i].dev = dev;
		mutex_init(&dev->vqs[i].mutex);
		vhost_vq_reset(dev, dev->vqs + i);
		if (dev->vqs[i].handle_kick)
			vhost_poll_init(&dev->vqs[i].poll,
					dev->vqs[i].handle_kick, POLLIN, dev);
	}

	return 0;
}
+
+#if 0
+/* Caller should have device mutex */
+long vhost_dev_check_owner(struct vhost_dev *dev)
+{
+ /* Are you the owner? If not, I don't think you mean to do that */
+ return dev->mm == current->mm ? 0 : -EPERM;
+}
+#endif
+
/* Payload carried through a vhost_work when attaching the worker
 * thread to the owner task's cgroups: the owner and the attach result.
 * NOTE(review): its users are not in this chunk — verify before
 * relying on this description. */
struct vhost_attach_cgroups_struct {
	struct vhost_work work;
	struct task_struct *owner;
	int ret;
};
+
+#if 0
+/* Caller should have device mutex */
+long vhost_dev_reset_owner(struct vhost_dev *dev)
+{
+ struct vhost_memory *memory;
+
+ /* Restore memory to default empty mapping. */
+ memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
+ if (!memory)
+ return -ENOMEM;
+
+ vhost_dev_cleanup(dev);
+
+ memory->nregions = 0;
+ dev->memory = memory;
+ return 0;
+}
+#endif
+
/* In case of DMA done not in order in lower device driver for some reason.
 * upend_idx is used to track end of used idx, done_idx is used to track head
 * of used idx. Once lower device DMA done contiguously, we will signal KVM
 * guest used idx.
 */
int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
{
	int i;
	int j = 0;

	/* Scan forward from done_idx; stop at the first entry whose DMA
	 * has not completed yet, so completions are signalled in order. */
	for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			vhost_add_used_and_signal(vq->dev, vq,
						  vq->heads[i].id, 0);
			++j;
		} else
			break;
	}
	/* Advance the head past everything we just signalled. */
	if (j)
		vq->done_idx = i;
	return j;
}
+
/* Caller should have device mutex */
/* Full device teardown: stop and flush per-vq polling, signal any
 * completed zero-copy buffers, drop every eventfd/file reference,
 * reset the vqs, free scratch arrays and the memory table, stop the
 * worker thread and release the owner mm. */
void vhost_dev_cleanup(struct vhost_dev *dev)
{
	int i;
	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
			vhost_poll_stop(&dev->vqs[i].poll);
			vhost_poll_flush(&dev->vqs[i].poll);
		}
		BUG_ON(dev->vqs[i].ubufs != NULL);

		/* Signal guest as appropriate. */
		vhost_zerocopy_signal_used(&dev->vqs[i]);

		if (dev->vqs[i].error_ctx)
			eventfd_ctx_put(dev->vqs[i].error_ctx);
		if (dev->vqs[i].error)
			fput(dev->vqs[i].error);
		if (dev->vqs[i].kick)
			fput(dev->vqs[i].kick);
		if (dev->vqs[i].call_ctx)
			eventfd_ctx_put(dev->vqs[i].call_ctx);
		if (dev->vqs[i].call)
			fput(dev->vqs[i].call);
		vhost_vq_reset(dev, dev->vqs + i);
	}
	vhost_dev_free_iovecs(dev);
	if (dev->log_ctx)
		eventfd_ctx_put(dev->log_ctx);
	dev->log_ctx = NULL;
	if (dev->log_file)
		fput(dev->log_file);
	dev->log_file = NULL;
	/* No one will access memory at this point */
	kfree(dev->memory);
	dev->memory = NULL;
	WARN_ON(!list_empty(&dev->work_list));
	if (dev->worker) {
		kthread_stop(dev->worker);
		dev->worker = NULL;
	}
	if (dev->mm)
		mmput(dev->mm);
	dev->mm = NULL;
}
+
+#if 0
+/* Caller must have device mutex */
+long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
+{
+ return 0;
+}
+#endif
+
/* Push the host's used_flags to the card-side used ring through the
 * aperture mapping (vq->log_addr holds the aperture VA here — see
 * vhost_blk_set_backend()). Always returns 0. */
static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
	iowrite16(vq->used_flags, mic_addr_in_host(vq->log_addr, &vq->used->flags));
	return 0;
}
+
+#if 0
+int vhost_init_used(struct vhost_virtqueue *vq)
+{
+ int r;
+ if (!vq->private_data)
+ return 0;
+
+ r = vhost_update_used_flags(vq);
+ if (r)
+ return r;
+ vq->signalled_used_valid = false;
+ vq->last_used_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->used->idx));
+ return 0;
+}
+#endif
+
+/* Each buffer in the virtqueues is actually a chain of descriptors. This
+ * function returns the next descriptor in the chain,
+ * or -1U if we're at the end. */
+static unsigned next_desc(struct vring_desc *desc)
+{
+ unsigned int next;
+
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!(desc->flags & VRING_DESC_F_NEXT))
+ return -1U;
+
+ /* Check they're not leading us off end of descriptors. */
+ next = desc->next;
+ /* Make sure compiler knows to grab that: we don't want it changing! */
+ /* We will use the result as an index in an array, so most
+ * architectures only need a compiler barrier here. */
+ read_barrier_depends();
+
+ return next;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error. */
+int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ struct vring_desc desc;
+ unsigned int i, head, found = 0;
+ u16 last_avail_idx;
+ int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+ vq->avail_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->idx));
+
+ if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return -EFAULT;
+ }
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->avail_idx == last_avail_idx)
+ return vq->num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ smp_rmb();
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ head = ioread16(mic_addr_in_host(vq->log_addr,
+ &vq->avail->ring[last_avail_idx % vq->num]));
+
+ /* If their number is silly, that's an error. */
+ if (unlikely(head >= vq->num)) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return -EINVAL;
+ }
+
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+ if (unlikely(log))
+ *log_num = 0;
+
+ i = head;
+ do {
+ unsigned iov_count = *in_num + *out_num;
+ if (unlikely(i >= vq->num)) {
+ vq_err(vq, "Desc index is %u > %u, head = %u",
+ i, vq->num, head);
+ return -EINVAL;
+ }
+ if (unlikely(++found > vq->num)) {
+ vq_err(vq, "Loop detected: last one at %u "
+ "vq size %u head %u\n",
+ i, vq->num, head);
+ return -EINVAL;
+ }
+ memcpy_fromio(&desc, mic_addr_in_host(vq->log_addr, vq->desc + i), sizeof(desc));
+
+ (iov + iov_count)->iov_base = (void *)desc.addr;
+ (iov + iov_count)->iov_len = desc.len;
+ ret = 1;
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ /* If this is an input descriptor,
+ * increment that count. */
+ *in_num += ret;
+ if (unlikely(log)) {
+ log[*log_num].addr = desc.addr;
+ log[*log_num].len = desc.len;
+ ++*log_num;
+ }
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (unlikely(*in_num)) {
+ vq_err(vq, "Descriptor has out after in: "
+ "idx %d\n", i);
+ return -EINVAL;
+ }
+ *out_num += ret;
+ }
+ } while ((i = next_desc(&desc)) != -1);
+
+ /* On success, increment avail index. */
+ vq->last_avail_idx++;
+
+ /* Assume notifications from guest are disabled at this point,
+ * if they aren't we would need to update avail_event index. */
+ BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
+ return head;
+}
+
+/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+{
+ vq->last_avail_idx -= n;
+}
+
/* After we've used one of their buffers, we tell them about it. We'll then
 * want to notify the guest, using eventfd. */
int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
{
	struct vring_used_elem __user *used;

	/* The virtqueue contains a ring of used buffers. Get a pointer to the
	 * next entry in that used ring. */
	used = &vq->used->ring[vq->last_used_idx % vq->num];
	/* NOTE(review): vring_used_elem.id/.len are 32-bit fields but only
	 * 16 bits are written here (__vhost_add_used_n below copies full
	 * elements with memcpy_toio) — confirm the upper halves are known
	 * to be zero on the device side. */
	iowrite16(head, mic_addr_in_host(vq->log_addr, &used->id));
	iowrite16(len, mic_addr_in_host(vq->log_addr, &used->len));
	/* Make sure buffer is written before we update index. */
	smp_wmb();
	/* Read back, result deliberately discarded — presumably flushes the
	 * posted MMIO writes above before the index update becomes visible. */
	ioread16(mic_addr_in_host(vq->log_addr, &used->id));
	iowrite16(vq->last_used_idx + 1, mic_addr_in_host(vq->log_addr, &vq->used->idx));

	vq->last_used_idx++;

	/* If the driver never bothers to signal in a very long while,
	 * used index might wrap around. If that happens, invalidate
	 * signalled_used index we stored. TODO: make sure driver
	 * signals at least once in 2^16 and remove this. */
	if (unlikely(vq->last_used_idx == vq->signalled_used))
		vq->signalled_used_valid = false;
	return 0;
}
+
/* Copy 'count' used elements into the device's used ring starting at the
 * current position.  Caller (vhost_add_used_n) guarantees the run does not
 * wrap past the end of the ring, and publishes the new used index itself.
 * Always returns 0. */
static int __vhost_add_used_n(struct vhost_virtqueue *vq,
			      struct vring_used_elem *heads,
			      unsigned count)
{
	struct vring_used_elem __user *used;
	u16 old, new;
	int start;

	start = vq->last_used_idx % vq->num;
	used = vq->used->ring + start;
	/* Bulk-copy whole elements into device memory (contrast with the
	 * field-by-field iowrite16 path in vhost_add_used). */
	memcpy_toio(mic_addr_in_host(vq->log_addr, used), heads, count * sizeof(*used));
	old = vq->last_used_idx;
	new = (vq->last_used_idx += count);
	/* If the driver never bothers to signal in a very long while,
	 * used index might wrap around. If that happens, invalidate
	 * signalled_used index we stored. TODO: make sure driver
	 * signals at least once in 2^16 and remove this. */
	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
		vq->signalled_used_valid = false;
	return 0;
}
+
/* After we've used one of their buffers, we tell them about it. We'll then
 * want to notify the guest, using eventfd. */
/* Multi-buffer variant of vhost_add_used(): publishes 'count' used
 * elements, splitting the copy in two when it would wrap past the end of
 * the ring, then writes the used index once. */
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
		     unsigned count)
{
	int start, n, r;

	start = vq->last_used_idx % vq->num;
	n = vq->num - start;
	/* Wrap case: fill to the end of the ring first. */
	if (n < count) {
		r = __vhost_add_used_n(vq, heads, n);
		if (r < 0)
			return r;
		heads += n;
		count -= n;
	}
	r = __vhost_add_used_n(vq, heads, count);

	/* Make sure buffer is written before we update index. */
	smp_wmb();
	iowrite16(vq->last_used_idx, mic_addr_in_host(vq->log_addr, &vq->used->idx));
	return r;
}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ __u16 old, new;
+ bool v;
+ /* Flush out used index updates. This is paired
+ * with the barrier that the Guest executes when enabling
+ * interrupts. */
+ smp_mb();
+
+ if (vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+ unlikely(vq->avail_idx == vq->last_avail_idx))
+ return true;
+
+ if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+ __u16 flags;
+ flags = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->flags));
+ return !(flags & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+ old = vq->signalled_used;
+ v = vq->signalled_used_valid;
+ new = vq->signalled_used = vq->last_used_idx;
+ vq->signalled_used_valid = true;
+
+ if (unlikely(!v))
+ return true;
+
+ return false;
+}
+
+/* This actually signals the guest, using eventfd. */
+void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ /* Signal the Guest tell them we used something up. */
+ if (vq->log_base && vhost_notify(dev, vq))
+ mic_send_virtio_intr((struct _mic_ctx_t *)vq->log_base);
+}
+
/* And here's the combo meal deal. Supersize me! */
/* Convenience wrapper: publish one used buffer, then interrupt the guest
 * if vhost_notify() says it is needed.  vhost_add_used()'s return value
 * (always 0) is deliberately ignored. */
void vhost_add_used_and_signal(struct vhost_dev *dev,
			       struct vhost_virtqueue *vq,
			       unsigned int head, int len)
{
	vhost_add_used(vq, head, len);
	vhost_signal(dev, vq);
}
+
#if 0
/* multi-buffer version of vhost_add_used_and_signal */
/* NOTE(review): compiled out — unused in this port; the prototype remains
 * in vhost.h should a caller need it again. */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
				 struct vhost_virtqueue *vq,
				 struct vring_used_elem *heads, unsigned count)
{
	vhost_add_used_n(vq, heads, count);
	vhost_signal(dev, vq);
}
#endif
+
/* OK, now we need to know about added descriptors. */
/* Re-enable guest->host kick notifications.  Returns true when new
 * buffers were made available while notifications were still off (the
 * caller must then process the ring once more to avoid missing them). */
bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
	u16 avail_idx;
	int r;
	/* Already enabled: nothing slipped in under us. */
	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
		return false;
	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
	if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
		r = vhost_update_used_flags(vq);
		if (r) {
			vq_err(vq, "Failed to enable notification at %p: %d\n",
			       &vq->used->flags, r);
			return false;
		}
	}
	/* They could have slipped one in as we were doing that: make
	 * sure it's written, then check again. */
	smp_mb();
	avail_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->idx));

	return avail_idx != vq->avail_idx;
}
+
+/* We don't need to be notified again. */
+void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ int r;
+ if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+ return;
+ vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+ if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+ r = vhost_update_used_flags(vq);
+ if (r)
+ vq_err(vq, "Failed to enable notification at %p: %d\n",
+ &vq->used->flags, r);
+ }
+}
+#endif
--- /dev/null
+/*
+ This is the exact copy of linux-2.6.32-220.7.1.el6.x86_64/drivers/vhost/vhost.h
+ except for this comment.
+ */
+#ifndef _VHOST_H
+#define _VHOST_H
+
+#include <linux/eventfd.h>
+#ifdef RHEL_RELEASE_CODE
+#include <linux/vhost.h>
+#else
+#include "./linux/vhost.h"
+#endif
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/skbuff.h>
+#include <linux/uio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <asm/atomic.h>
+
+/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
+ * done */
+#define VHOST_DMA_DONE_LEN 1
+#define VHOST_DMA_CLEAR_LEN 0
+
+struct vhost_device;
+
+struct vhost_work;
+typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+
/* A unit of deferred work executed on the per-device worker kthread
 * (vhost_dev.worker; queued on vhost_dev.work_list).
 * NOTE(review): flushing/queue_seq/done_seq appear to implement the
 * flush handshake used by vhost_poll_flush() — confirm against
 * vhost_work_queue()/flush, which are outside this file view. */
struct vhost_work {
	struct list_head node;
	vhost_work_fn_t fn;
	wait_queue_head_t done;
	int flushing;
	unsigned queue_seq;
	unsigned done_seq;
};
+
/* Poll a file (eventfd or socket) */
/* Note: there's nothing vhost specific about this structure. */
/* Ties a poll-table registration on a file to a vhost_work item, so that
 * readiness apparently queues 'work' on dev's worker; managed with
 * vhost_poll_start/stop/flush/queue below. */
struct vhost_poll {
	poll_table table;
	wait_queue_head_t *wqh;		/* waitqueue we registered on */
	wait_queue_t wait;
	struct vhost_work work;
	unsigned long mask;		/* poll event mask of interest */
	struct vhost_dev *dev;
};
+
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+ unsigned long mask, struct vhost_dev *dev);
+void vhost_poll_start(struct vhost_poll *poll, struct file *file);
+void vhost_poll_stop(struct vhost_poll *poll);
+void vhost_poll_flush(struct vhost_poll *poll);
+void vhost_poll_queue(struct vhost_poll *poll);
+
/* One write to be logged for dirty tracking: guest address and length,
 * filled from a writable descriptor in vhost_get_vq_desc(). */
struct vhost_log {
	u64 addr;
	u64 len;
};
+
+struct vhost_virtqueue;
+
/* Reference count on outstanding zerocopy userspace buffers of one
 * virtqueue.  NOTE(review): vhost_ubuf_put_and_wait() presumably sleeps
 * on 'wait' until all references drop — confirm in its definition. */
struct vhost_ubuf_ref {
	struct kref kref;
	wait_queue_head_t wait;
	struct vhost_virtqueue *vq;
};
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool zcopy);
+void vhost_ubuf_put(struct vhost_ubuf_ref *);
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
	struct vhost_dev *dev;

	/* The actual ring of buffers. */
	struct mutex mutex;
	unsigned int num;
	struct vring_desc __user *desc;
	struct vring_avail __user *avail;
	struct vring_used __user *used;
	struct file *kick;
	struct file *call;
	struct file *error;
	struct eventfd_ctx *call_ctx;
	struct eventfd_ctx *error_ctx;
	struct eventfd_ctx *log_ctx;

	struct vhost_poll poll;

	/* The routine to call when the Guest pings us, or timeout. */
	vhost_work_fn_t handle_kick;

	/* Last available index we saw. */
	u16 last_avail_idx;

	/* Caches available index value from user. */
	u16 avail_idx;

	/* Last index we used. */
	u16 last_used_idx;

	/* Used flags */
	u16 used_flags;

	/* Last used index value we have signalled on */
	u16 signalled_used;

	/* Whether signalled_used above is valid (cleared when the used
	 * index wraps past it; see vhost_add_used). */
	bool signalled_used_valid;

	/* Log writes to used structure. */
	bool log_used;
	/* NOTE(review): in this MIC port log_addr doubles as the base for
	 * host-side address translation — every mic_addr_in_host() call in
	 * vhost.c passes it — rather than a dirty-log address; confirm. */
	u64 log_addr;

	struct iovec iov[UIO_MAXIOV];
	/* hdr is used to store the virtio header.
	 * Since each iovec has >= 1 byte length, we never need more than
	 * header length entries to store the header. */
	struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)];
	struct iovec *indirect;
	size_t vhost_hlen;
	size_t sock_hlen;
	struct vring_used_elem *heads;
	/* We use a kind of RCU to access private pointer.
	 * All readers access it from worker, which makes it possible to
	 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
	 * vhost_work execution acts instead of rcu_read_lock() and the end of
	 * vhost_work execution acts instead of rcu_read_unlock().
	 * Writers use virtqueue mutex. */
	void *private_data;
	/* Log write descriptors */
	/* NOTE(review): log_base is cast to a mic context and used to raise
	 * the guest interrupt in vhost_signal() in this port — confirm it is
	 * never used as an actual log base here. */
	void __user *log_base;
	struct vhost_log *log;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* first used idx for DMA done zerocopy buffers */
	int done_idx;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex. */
	struct vhost_ubuf_ref *ubufs;
};
+
struct vhost_dev {
	/* Readers use RCU to access memory table pointer
	 * log base pointer and features.
	 * Writers use mutex below.*/
	struct vhost_memory *memory;	/* guest memory map; kfree'd in vhost_dev_cleanup */
	struct mm_struct *mm;		/* owner's mm; mmput() on cleanup */
	struct mutex mutex;
	unsigned acked_features;	/* feature bits acked by userspace; see vhost_has_feature */
	struct vhost_virtqueue *vqs;	/* array of nvqs virtqueues */
	int nvqs;
	struct file *log_file;		/* dirty-log eventfd file, if any */
	struct eventfd_ctx *log_ctx;
	spinlock_t work_lock;		/* protects work_list */
	struct list_head work_list;	/* pending vhost_work items for the worker */
	struct task_struct *worker;	/* kthread draining work_list; kthread_stop'd on cleanup */
};
+
+long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
+long vhost_dev_check_owner(struct vhost_dev *);
+long vhost_dev_reset_owner(struct vhost_dev *);
+void vhost_dev_cleanup(struct vhost_dev *);
+long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg);
+int vhost_vq_access_ok(struct vhost_virtqueue *vq);
+int vhost_log_access_ok(struct vhost_dev *);
+
+int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num);
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
+
+int vhost_init_used(struct vhost_virtqueue *);
+int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
+int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+ unsigned count);
+void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
+ unsigned int id, int len);
+void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
+ struct vring_used_elem *heads, unsigned count);
+void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
+void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+
+int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
+ unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(void *arg);
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
+
/* Report a virtqueue error: emit a debug message and, if userspace
 * registered an error eventfd for this vq, signal it so the other side
 * can react. */
#define vq_err(vq, fmt, ...) do {                                  \
		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
		if ((vq)->error_ctx)                               \
				eventfd_signal((vq)->error_ctx, 1);\
	} while (0)
+
/* Compat shim: define __rcu_dereference_index_check() on kernels that do
 * not provide it.  Two variants because the lockdep assertion API was
 * renamed in 4.3 (rcu_lockdep_assert -> RCU_LOCKDEP_WARN). */
#ifndef __rcu_dereference_index_check
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0))
#define __rcu_dereference_index_check(p, c) \
	({ \
	 typeof(p) _________p1 = ACCESS_ONCE(p); \
	 rcu_lockdep_assert(c, \
		 "suspicious rcu_dereference_index_check()" \
		 " usage"); \
	 smp_read_barrier_depends(); \
	 (_________p1); \
	 })
#else
/* NOTE(review): RCU_LOCKDEP_WARN() fires when its condition is TRUE —
 * the opposite sense of rcu_lockdep_assert() — so passing the same 'c'
 * here may warn in the safe case; confirm and negate if needed. */
#define __rcu_dereference_index_check(p, c) \
	({ \
	 typeof(p) _________p1 = ACCESS_ONCE(p); \
	 RCU_LOCKDEP_WARN(c, \
		 "suspicious rcu_dereference_index_check()" \
		 " usage"); \
	 smp_read_barrier_depends(); \
	 (_________p1); \
	 })
#endif
#endif
+
/* Full set of virtio/vhost feature bits this implementation can offer to
 * userspace for acking (stored in vhost_dev.acked_features). */
enum {
	VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
			 (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
			 (1ULL << VIRTIO_RING_F_EVENT_IDX) |
			 (1ULL << VHOST_F_LOG_ALL) |
			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
			 (1ULL << VIRTIO_NET_F_MRG_RXBUF),
};
+
/* Test whether userspace acked feature 'bit' on this device.
 * acked_features is a plain integer read under RCU-style rules (writers
 * hold vhost_dev.mutex), hence the *_index_check variants which accept
 * non-pointer values; the #if ladder picks whichever accessor the
 * running kernel / RHEL release provides. */
static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
{
#ifdef RHEL_RELEASE_CODE
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	unsigned acked_features = rcu_dereference_index_check(dev->acked_features, rcu_read_lock_held());
#else
	unsigned acked_features = rcu_dereference(dev->acked_features);
#endif
#else
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0))
	unsigned acked_features = rcu_dereference_index_check(dev->acked_features, rcu_read_lock_held());
#else
	unsigned acked_features = __rcu_dereference_index_check(dev->acked_features, rcu_read_lock_held());
#endif
#endif
	return acked_features & (1 << bit);
}
+
+void vhost_enable_zcopy(int vq);
+
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * fs/proc/vmcore.c Interface for accessing the crash
+ * dump from the system's previous life.
+ * Heavily borrowed from fs/proc/kcore.c
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/crash_dump.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+#include <linux/kcore.h>
+#endif
+#include "mic_common.h"
+
+extern struct proc_dir_entry *vmcore_dir;
+
/* Stores the physical address of elf header of crash image. */
/* NOTE(review): hard-coded to 0x50e9000 — presumably the fixed card-side
 * address where the crashed kernel leaves its ELF core header; confirm
 * against the card's kexec/kdump setup before changing. */
unsigned long long elfcorehdr_addr = 0x50e9000;
+
+/**
+ * mic_copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t mic_copy_oldmem_page(mic_ctx_t *mic_ctx,
+ unsigned long pfn, char *buf,
+ size_t csize, unsigned long offset, int userbuf)
+{
+ void *vaddr, *tmp;
+ int err;
+ struct dma_channel *dma_chan;
+ dma_addr_t mic_dst_phys_addr;
+
+ vaddr = mic_ctx->aper.va + (pfn << PAGE_SHIFT);
+
+ if (!csize)
+ return 0;
+ if (csize == PAGE_SIZE && !offset) {
+ if (!(tmp = (void*)__get_free_pages(GFP_KERNEL, get_order(PAGE_SIZE)))) {
+ printk(KERN_ERR "%s: tmp buffer allocation failed\n", __func__);
+ return -ENOMEM;
+ }
+ mic_dst_phys_addr = mic_ctx_map_single(mic_ctx, tmp, csize);
+ if (mic_map_error(mic_dst_phys_addr)) {
+ printk(KERN_ERR "%s: mic_ctx_map_single failed\n", __func__);
+ free_pages((unsigned long)tmp, get_order(PAGE_SIZE));
+ return -ENOMEM;
+ }
+
+ if ((allocate_dma_channel(mic_ctx->dma_handle, &dma_chan))) {
+ printk(KERN_ERR "%s: allocate_dma_channel failed\n", __func__);
+ mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize);
+ free_pages((unsigned long)tmp, get_order(PAGE_SIZE));
+ return -EBUSY;
+ }
+
+ err = do_dma(dma_chan,
+ 0,
+ pfn << PAGE_SHIFT,
+ mic_dst_phys_addr,
+ csize,
+ NULL);
+ if (err) {
+ printk(KERN_ERR "DMA do_dma err %s %d err %d src 0x%lx "
+ "dst 0x%llx csize 0x%lx\n",
+ __func__, __LINE__, err, pfn << PAGE_SHIFT,
+ mic_dst_phys_addr, csize);
+ free_dma_channel(dma_chan);
+ mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize);
+ free_pages((unsigned long)tmp, get_order(PAGE_SIZE));
+ return err;
+ }
+ free_dma_channel(dma_chan);
+ err = drain_dma_poll(dma_chan);
+ if (err) {
+ printk(KERN_ERR "DMA poll err %s %d err %d src 0x%lx i"
+ "dst 0x%llx csize 0x%lx\n",
+ __func__, __LINE__, err, pfn << PAGE_SHIFT,
+ mic_dst_phys_addr, csize);
+ mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize);
+ free_pages((unsigned long)tmp, get_order(PAGE_SIZE));
+ return err;
+ }
+ if (userbuf) {
+ if (copy_to_user(buf, tmp, csize)) {
+ mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize);
+ free_pages((unsigned long)tmp, get_order(PAGE_SIZE));
+ return -EFAULT;
+ }
+ } else {
+ memcpy(buf, tmp, csize);
+ }
+ smp_mb();
+ mic_ctx_unmap_single(mic_ctx, mic_dst_phys_addr, csize);
+ free_pages((unsigned long)tmp, get_order(PAGE_SIZE));
+ } else {
+ if (userbuf) {
+ if (copy_to_user(buf, vaddr + offset, csize))
+ return -EFAULT;
+ } else
+ memcpy_fromio(buf, vaddr + offset, csize);
+ }
+ return csize;
+}
+
+/* Reads a page from the oldmem device from given offset. */
+static ssize_t read_from_oldmem(mic_ctx_t *mic_ctx,
+ char *buf, size_t count,
+ u64 *ppos, int userbuf)
+{
+ unsigned long pfn, offset;
+ size_t nr_bytes;
+ ssize_t read = 0, tmp;
+
+ if (!count)
+ return 0;
+
+ offset = (unsigned long)(*ppos % PAGE_SIZE);
+ pfn = (unsigned long)(*ppos / PAGE_SIZE);
+
+ do {
+ if (count > (PAGE_SIZE - offset))
+ nr_bytes = PAGE_SIZE - offset;
+ else
+ nr_bytes = count;
+
+ tmp = mic_copy_oldmem_page(mic_ctx, pfn, buf, nr_bytes, offset, userbuf);
+ if (tmp < 0)
+ return tmp;
+ *ppos += nr_bytes;
+ count -= nr_bytes;
+ buf += nr_bytes;
+ read += nr_bytes;
+ ++pfn;
+ offset = 0;
+ } while (count);
+
+ return read;
+}
+
+/* Maps vmcore file offset to respective physical address in memroy. */
+static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
+ struct vmcore **m_ptr)
+{
+ struct vmcore *m;
+ u64 paddr;
+
+ list_for_each_entry(m, vc_list, list) {
+ u64 start, end;
+ start = m->offset;
+ end = m->offset + m->size - 1;
+ if (offset >= start && offset <= end) {
+ paddr = m->paddr + offset - start;
+ *m_ptr = m;
+ return paddr;
+ }
+ }
+ *m_ptr = NULL;
+ return 0;
+}
+
/* Read from the ELF header and then the crash dump. On error, negative value is
 * returned otherwise number of bytes read are returned.
 */
static ssize_t read_vmcore(struct file *file, char __user *buffer,
				size_t buflen, loff_t *fpos)
{
	ssize_t acc = 0, tmp;
	size_t tsz;
	u64 start, nr_bytes;
	struct vmcore *curr_m = NULL;
	struct inode *inode = file->f_path.dentry->d_inode;
	/* The mic context was stashed as the proc entry's data at creation;
	 * the accessor API changed in 3.10 (PDE_DATA). */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	mic_ctx_t *mic_ctx = PDE_DATA(inode);
#else
	struct proc_dir_entry *entry = PDE(inode);
	mic_ctx_t *mic_ctx = entry->data;
#endif

	if (buflen == 0 || *fpos >= mic_ctx->vmcore_size)
		return 0;

	/* trim buflen to not go beyond EOF */
	if (buflen > mic_ctx->vmcore_size - *fpos)
		buflen = mic_ctx->vmcore_size - *fpos;

	/* Read ELF core header */
	if (*fpos < mic_ctx->elfcorebuf_sz) {
		tsz = mic_ctx->elfcorebuf_sz - *fpos;
		if (buflen < tsz)
			tsz = buflen;
		if (copy_to_user(buffer, mic_ctx->elfcorebuf + *fpos, tsz))
			return -EFAULT;
		buflen -= tsz;
		*fpos += tsz;
		buffer += tsz;
		acc += tsz;

		/* leave now if filled buffer already */
		if (buflen == 0)
			return acc;
	}

	/* Map the remaining file position to an old-memory address, then
	 * copy segment by segment; each chunk is clamped to both a page
	 * boundary and the end of the current segment. */
	start = map_offset_to_paddr(*fpos, &mic_ctx->vmcore_list, &curr_m);
	if (!curr_m)
		return -EINVAL;
	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
		tsz = buflen;

	/* Calculate left bytes in current memory segment. */
	nr_bytes = (curr_m->size - (start - curr_m->paddr));
	if (tsz > nr_bytes)
		tsz = nr_bytes;

	while (buflen) {
		tmp = read_from_oldmem(mic_ctx,buffer, tsz, &start, 1);
		if (tmp < 0)
			return tmp;
		buflen -= tsz;
		*fpos += tsz;
		buffer += tsz;
		acc += tsz;
		/* Crossed into the next segment (read_from_oldmem advanced
		 * 'start' for us). */
		if (start >= (curr_m->paddr + curr_m->size)) {
			if (curr_m->list.next == &mic_ctx->vmcore_list)
				return acc;	/*EOF*/
			curr_m = list_entry(curr_m->list.next,
						struct vmcore, list);
			start = curr_m->paddr;
		}
		if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
			tsz = buflen;
		/* Calculate left bytes in current memory segment. */
		nr_bytes = (curr_m->size - (start - curr_m->paddr));
		if (tsz > nr_bytes)
			tsz = nr_bytes;
	}
	return acc;
}
+
/* The per-card /proc vmcore file is read-only; everything goes through
 * read_vmcore(). */
static const struct file_operations proc_vmcore_operations = {
	.read	= read_vmcore,
};
+
+static struct vmcore* get_new_element(void)
+{
+ return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
+}
+
+static u64 get_vmcore_size_elf64(char *elfptr)
+{
+ int i;
+ u64 size;
+ Elf64_Ehdr *ehdr_ptr;
+ Elf64_Phdr *phdr_ptr;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+ phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+ size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
+ for (i = 0; i < ehdr_ptr->e_phnum; i++) {
+ size += phdr_ptr->p_memsz;
+ phdr_ptr++;
+ }
+ return size;
+}
+
+static u64 get_vmcore_size_elf32(char *elfptr)
+{
+ int i;
+ u64 size;
+ Elf32_Ehdr *ehdr_ptr;
+ Elf32_Phdr *phdr_ptr;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+ phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
+ size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr));
+ for (i = 0; i < ehdr_ptr->e_phnum; i++) {
+ size += phdr_ptr->p_memsz;
+ phdr_ptr++;
+ }
+ return size;
+}
+
/* Merges all the PT_NOTE headers into one. */
/* Collects the real (packed) contents of every PT_NOTE segment onto
 * vc_list, then rewrites the in-memory header copy so it contains a
 * single merged PT_NOTE program header followed by the remaining ones. */
static int merge_note_headers_elf64(mic_ctx_t *mic_ctx,
				char *elfptr, size_t *elfsz,
				struct list_head *vc_list)
{
	int i, nr_ptnote=0, rc=0;
	char *tmp;
	Elf64_Ehdr *ehdr_ptr;
	Elf64_Phdr phdr, *phdr_ptr;
	Elf64_Nhdr *nhdr_ptr;
	u64 phdr_sz = 0, note_off;

	ehdr_ptr = (Elf64_Ehdr *)elfptr;
	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
		int j;
		void *notes_section;
		struct vmcore *new;
		u64 offset, max_sz, sz, real_sz = 0;
		if (phdr_ptr->p_type != PT_NOTE)
			continue;
		nr_ptnote++;
		max_sz = phdr_ptr->p_memsz;
		offset = phdr_ptr->p_offset;
		/* Pull the whole notes segment out of old memory so we can
		 * measure its actual contents. */
		notes_section = kmalloc(max_sz, GFP_KERNEL);
		if (!notes_section)
			return -ENOMEM;
		rc = read_from_oldmem(mic_ctx, notes_section, max_sz, &offset, 0);
		if (rc < 0) {
			kfree(notes_section);
			return rc;
		}
		nhdr_ptr = notes_section;
		/* Walk the notes: each entry is header + 4-byte-aligned name
		 * + 4-byte-aligned descriptor; a zero name size ends them. */
		for (j = 0; j < max_sz; j += sz) {
			if (nhdr_ptr->n_namesz == 0)
				break;
			sz = sizeof(Elf64_Nhdr) +
				((nhdr_ptr->n_namesz + 3) & ~3) +
				((nhdr_ptr->n_descsz + 3) & ~3);
			real_sz += sz;
			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
		}

		/* Add this contiguous chunk of notes section to vmcore list.*/
		new = get_new_element();
		if (!new) {
			kfree(notes_section);
			return -ENOMEM;
		}
		/* NOTE(review): paddr is set to the segment's file offset; in
		 * this driver old-memory reads are addressed by offset into
		 * the card's aperture, so the two appear interchangeable —
		 * confirm against mic_copy_oldmem_page's addressing. */
		new->paddr = phdr_ptr->p_offset;
		new->size = real_sz;
		list_add_tail(&new->list, vc_list);
		phdr_sz += real_sz;
		kfree(notes_section);
	}

	/* Prepare merged PT_NOTE program header. */
	phdr.p_type    = PT_NOTE;
	phdr.p_flags   = 0;
	note_off = sizeof(Elf64_Ehdr) +
			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
	phdr.p_offset  = note_off;
	phdr.p_vaddr   = phdr.p_paddr = 0;
	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
	phdr.p_align   = 0;

	/* Add merged PT_NOTE program header*/
	tmp = elfptr + sizeof(Elf64_Ehdr);
	memcpy(tmp, &phdr, sizeof(phdr));
	tmp += sizeof(phdr);

	/* Remove unwanted PT_NOTE program headers. */
	i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
	*elfsz = *elfsz - i;
	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));

	/* Modify e_phnum to reflect merged headers. */
	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;

	return 0;
}
+
/* Merges all the PT_NOTE headers into one. */
/* 32-bit twin of merge_note_headers_elf64(): collect every PT_NOTE
 * segment's packed contents onto vc_list and rewrite the header copy to
 * hold a single merged PT_NOTE program header. */
static int merge_note_headers_elf32(mic_ctx_t *mic_ctx,
				char *elfptr, size_t *elfsz,
				struct list_head *vc_list)
{
	int i, nr_ptnote=0, rc=0;
	char *tmp;
	Elf32_Ehdr *ehdr_ptr;
	Elf32_Phdr phdr, *phdr_ptr;
	Elf32_Nhdr *nhdr_ptr;
	u64 phdr_sz = 0, note_off;

	ehdr_ptr = (Elf32_Ehdr *)elfptr;
	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
		int j;
		void *notes_section;
		struct vmcore *new;
		u64 offset, max_sz, sz, real_sz = 0;
		if (phdr_ptr->p_type != PT_NOTE)
			continue;
		nr_ptnote++;
		max_sz = phdr_ptr->p_memsz;
		offset = phdr_ptr->p_offset;
		/* Pull the whole notes segment out of old memory so we can
		 * measure its actual contents. */
		notes_section = kmalloc(max_sz, GFP_KERNEL);
		if (!notes_section)
			return -ENOMEM;
		rc = read_from_oldmem(mic_ctx, notes_section, max_sz, &offset, 0);
		if (rc < 0) {
			kfree(notes_section);
			return rc;
		}
		nhdr_ptr = notes_section;
		/* Walk the notes: header + 4-byte-aligned name + 4-byte-
		 * aligned descriptor per entry; zero name size ends them. */
		for (j = 0; j < max_sz; j += sz) {
			if (nhdr_ptr->n_namesz == 0)
				break;
			sz = sizeof(Elf32_Nhdr) +
				((nhdr_ptr->n_namesz + 3) & ~3) +
				((nhdr_ptr->n_descsz + 3) & ~3);
			real_sz += sz;
			nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
		}

		/* Add this contiguous chunk of notes section to vmcore list.*/
		new = get_new_element();
		if (!new) {
			kfree(notes_section);
			return -ENOMEM;
		}
		/* NOTE(review): file offset used as "paddr" — see the matching
		 * note in merge_note_headers_elf64(). */
		new->paddr = phdr_ptr->p_offset;
		new->size = real_sz;
		list_add_tail(&new->list, vc_list);
		phdr_sz += real_sz;
		kfree(notes_section);
	}

	/* Prepare merged PT_NOTE program header. */
	phdr.p_type    = PT_NOTE;
	phdr.p_flags   = 0;
	note_off = sizeof(Elf32_Ehdr) +
			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
	phdr.p_offset  = note_off;
	phdr.p_vaddr   = phdr.p_paddr = 0;
	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
	phdr.p_align   = 0;

	/* Add merged PT_NOTE program header*/
	tmp = elfptr + sizeof(Elf32_Ehdr);
	memcpy(tmp, &phdr, sizeof(phdr));
	tmp += sizeof(phdr);

	/* Remove unwanted PT_NOTE program headers. */
	i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
	*elfsz = *elfsz - i;
	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));

	/* Modify e_phnum to reflect merged headers. */
	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;

	return 0;
}
+
+/* Add memory chunks represented by program headers to vmcore list. Also update
+ * the new offset fields of exported program headers. */
+static int process_ptload_program_headers_elf64(char *elfptr,
+ size_t elfsz,
+ struct list_head *vc_list)
+{
+ int i;
+ Elf64_Ehdr *ehdr_ptr;
+ Elf64_Phdr *phdr_ptr;
+ loff_t vmcore_off;
+ struct vmcore *new;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+ phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
+
+ /* First program header is PT_NOTE header. */
+ vmcore_off = sizeof(Elf64_Ehdr) +
+ (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
+ phdr_ptr->p_memsz; /* Note sections */
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_LOAD)
+ continue;
+
+ /* Add this contiguous chunk of memory to vmcore list.*/
+ new = get_new_element();
+ if (!new)
+ return -ENOMEM;
+ new->paddr = phdr_ptr->p_offset;
+ new->size = phdr_ptr->p_memsz;
+ list_add_tail(&new->list, vc_list);
+
+ /* Update the program header offset. */
+ phdr_ptr->p_offset = vmcore_off;
+ vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ }
+ return 0;
+}
+
+static int process_ptload_program_headers_elf32(char *elfptr,
+ size_t elfsz,
+ struct list_head *vc_list)
+{
+ int i;
+ Elf32_Ehdr *ehdr_ptr;
+ Elf32_Phdr *phdr_ptr;
+ loff_t vmcore_off;
+ struct vmcore *new;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+ phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
+
+ /* First program header is PT_NOTE header. */
+ vmcore_off = sizeof(Elf32_Ehdr) +
+ (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
+ phdr_ptr->p_memsz; /* Note sections */
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_LOAD)
+ continue;
+
+ /* Add this contiguous chunk of memory to vmcore list.*/
+ new = get_new_element();
+ if (!new)
+ return -ENOMEM;
+ new->paddr = phdr_ptr->p_offset;
+ new->size = phdr_ptr->p_memsz;
+ list_add_tail(&new->list, vc_list);
+
+ /* Update the program header offset */
+ phdr_ptr->p_offset = vmcore_off;
+ vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ }
+ return 0;
+}
+
+/* Sets offset fields of vmcore elements. */
+static void set_vmcore_list_offsets_elf64(char *elfptr,
+ struct list_head *vc_list)
+{
+ loff_t vmcore_off;
+ Elf64_Ehdr *ehdr_ptr;
+ struct vmcore *m;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+ /* Skip Elf header and program headers. */
+ vmcore_off = sizeof(Elf64_Ehdr) +
+ (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+
+ list_for_each_entry(m, vc_list, list) {
+ m->offset = vmcore_off;
+ vmcore_off += m->size;
+ }
+}
+
+/* Sets offset fields of vmcore elements. */
+static void set_vmcore_list_offsets_elf32(char *elfptr,
+ struct list_head *vc_list)
+{
+ loff_t vmcore_off;
+ Elf32_Ehdr *ehdr_ptr;
+ struct vmcore *m;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+ /* Skip Elf header and program headers. */
+ vmcore_off = sizeof(Elf32_Ehdr) +
+ (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
+
+ list_for_each_entry(m, vc_list, list) {
+ m->offset = vmcore_off;
+ vmcore_off += m->size;
+ }
+}
+
+static int parse_crash_elf64_headers(mic_ctx_t *mic_ctx)
+{
+ int rc=0;
+ Elf64_Ehdr ehdr;
+ u64 addr;
+
+ addr = elfcorehdr_addr;
+
+ /* Read Elf header */
+ rc = read_from_oldmem(mic_ctx, (char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0);
+ if (rc < 0)
+ return rc;
+
+ /* Do some basic Verification. */
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+ (ehdr.e_type != ET_CORE) ||
+#ifdef CONFIG_CRASH_DUMP
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,36))
+ !vmcore_elf64_check_arch(&ehdr) ||
+#else
+ !vmcore_elf_check_arch(&ehdr) ||
+#endif
+#else
+ !elf_check_arch(&ehdr) ||
+#endif
+ ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
+ ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr.e_version != EV_CURRENT ||
+ ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
+ ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
+ ehdr.e_phnum == 0) {
+ printk(KERN_WARNING "Warning: Core image elf header is not"
+ "sane\n");
+ return -EINVAL;
+ }
+
+ WARN_ON(mic_ctx->elfcorebuf);
+ /* Read in all elf headers. */
+ mic_ctx->elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
+ mic_ctx->elfcorebuf = kmalloc(mic_ctx->elfcorebuf_sz, GFP_KERNEL);
+ if (!mic_ctx->elfcorebuf)
+ return -ENOMEM;
+ addr = elfcorehdr_addr;
+ rc = read_from_oldmem(mic_ctx, mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz, &addr, 0);
+ if (rc < 0) {
+ kfree(mic_ctx->elfcorebuf);
+ mic_ctx->elfcorebuf = NULL;
+ return rc;
+ }
+
+ /* Merge all PT_NOTE headers into one. */
+ rc = merge_note_headers_elf64(mic_ctx, mic_ctx->elfcorebuf, &mic_ctx->elfcorebuf_sz, &mic_ctx->vmcore_list);
+ if (rc) {
+ kfree(mic_ctx->elfcorebuf);
+ mic_ctx->elfcorebuf = NULL;
+ return rc;
+ }
+ rc = process_ptload_program_headers_elf64(mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz,
+ &mic_ctx->vmcore_list);
+ if (rc) {
+ kfree(mic_ctx->elfcorebuf);
+ mic_ctx->elfcorebuf = NULL;
+ return rc;
+ }
+ set_vmcore_list_offsets_elf64(mic_ctx->elfcorebuf, &mic_ctx->vmcore_list);
+ return 0;
+}
+
+static int parse_crash_elf32_headers(mic_ctx_t *mic_ctx)
+{
+ int rc=0;
+ Elf32_Ehdr ehdr;
+ u64 addr;
+
+ addr = elfcorehdr_addr;
+
+ /* Read Elf header */
+ rc = read_from_oldmem(mic_ctx, (char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0);
+ if (rc < 0)
+ return rc;
+
+ /* Do some basic Verification. */
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+ (ehdr.e_type != ET_CORE) ||
+ !elf_check_arch(&ehdr) ||
+ ehdr.e_ident[EI_CLASS] != ELFCLASS32||
+ ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr.e_version != EV_CURRENT ||
+ ehdr.e_ehsize != sizeof(Elf32_Ehdr) ||
+ ehdr.e_phentsize != sizeof(Elf32_Phdr) ||
+ ehdr.e_phnum == 0) {
+ printk(KERN_WARNING "Warning: Core image elf header is not"
+ "sane\n");
+ return -EINVAL;
+ }
+
+ WARN_ON(mic_ctx->elfcorebuf);
+ /* Read in all elf headers. */
+ mic_ctx->elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+ mic_ctx->elfcorebuf = kmalloc(mic_ctx->elfcorebuf_sz, GFP_KERNEL);
+ if (!mic_ctx->elfcorebuf)
+ return -ENOMEM;
+ addr = elfcorehdr_addr;
+ rc = read_from_oldmem(mic_ctx, mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz, &addr, 0);
+ if (rc < 0) {
+ kfree(mic_ctx->elfcorebuf);
+ mic_ctx->elfcorebuf = NULL;
+ return rc;
+ }
+
+ /* Merge all PT_NOTE headers into one. */
+ rc = merge_note_headers_elf32(mic_ctx, mic_ctx->elfcorebuf, &mic_ctx->elfcorebuf_sz, &mic_ctx->vmcore_list);
+ if (rc) {
+ kfree(mic_ctx->elfcorebuf);
+ mic_ctx->elfcorebuf = NULL;
+ return rc;
+ }
+ rc = process_ptload_program_headers_elf32(mic_ctx->elfcorebuf, mic_ctx->elfcorebuf_sz,
+ &mic_ctx->vmcore_list);
+ if (rc) {
+ kfree(mic_ctx->elfcorebuf);
+ mic_ctx->elfcorebuf = NULL;
+ return rc;
+ }
+ set_vmcore_list_offsets_elf32(mic_ctx->elfcorebuf, &mic_ctx->vmcore_list);
+ return 0;
+}
+
+static int parse_crash_elf_headers(mic_ctx_t *mic_ctx)
+{
+ unsigned char e_ident[EI_NIDENT];
+ u64 addr;
+ int rc=0;
+
+ addr = elfcorehdr_addr;
+ rc = read_from_oldmem(mic_ctx, e_ident, EI_NIDENT, &addr, 0);
+ if (rc < 0)
+ return rc;
+ if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
+ printk(KERN_WARNING "Warning: Core image elf header"
+ " not found\n");
+ return -EINVAL;
+ }
+
+ if (e_ident[EI_CLASS] == ELFCLASS64) {
+ rc = parse_crash_elf64_headers(mic_ctx);
+ if (rc)
+ return rc;
+
+ /* Determine vmcore size. */
+ mic_ctx->vmcore_size = get_vmcore_size_elf64(mic_ctx->elfcorebuf);
+ } else if (e_ident[EI_CLASS] == ELFCLASS32) {
+ rc = parse_crash_elf32_headers(mic_ctx);
+ if (rc)
+ return rc;
+
+ /* Determine vmcore size. */
+ mic_ctx->vmcore_size = get_vmcore_size_elf32(mic_ctx->elfcorebuf);
+ } else {
+ printk(KERN_WARNING "Warning: Core image elf header is not"
+ " sane\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Init function for vmcore module. */
+int vmcore_create(mic_ctx_t *mic_ctx)
+{
+ int rc = 0;
+ char name[64];
+ if (!vmcore_dir) {
+ rc = -ENOMEM;
+ return rc;
+ }
+ INIT_LIST_HEAD(&mic_ctx->vmcore_list);
+ rc = parse_crash_elf_headers(mic_ctx);
+ if (rc) {
+ printk(KERN_WARNING "Kdump: vmcore not initialized\n");
+ if (mic_ctx->vmcore_dir) {
+ remove_proc_entry(name, vmcore_dir);
+ mic_ctx->vmcore_dir = NULL;
+ }
+ return rc;
+ }
+ snprintf(name, 64, "mic%d", mic_ctx->bi_id);
+ if (!mic_ctx->vmcore_dir) {
+ mic_ctx->vmcore_dir = proc_create_data(name, S_IRUSR,
+ vmcore_dir, &proc_vmcore_operations, mic_ctx);
+ if (!mic_ctx->vmcore_dir) {
+ printk(KERN_WARNING "Kdump: proc creation for %s failed\n", name);
+ rc = -ENOMEM;
+ return rc;
+ }
+ }
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+#else
+ if (mic_ctx->vmcore_dir)
+ mic_ctx->vmcore_dir->size = mic_ctx->vmcore_size;
+#endif
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
/* Version of the boot-parameter layout shared between host and card;
 * presumably must be bumped whenever the structs below change — verify
 * against the card-side headers. */
#define MIC_BOOT_PARAM_HEADER_VERSION 8

/* Boot status codes (see mic_bootparam.bp_bootstatus). */
#define MIC_OS_BOOTSTATUS_SUCCESS 1
#define MIC_OS_BOOTSTATUS_BOOT_0 2 // Initial state of uOS boot
#define MIC_OS_BOOTSTATUS_ERROR_VERSION_MISMATCH 3
#define MIC_OS_BOOTSTATUS_ERROR 4

/* Host type (see host_bootparam.bp_host_type). */
#define MIC_HOST_DEFAULT 6 // Only value accepted so do not change

/* Engine indices into the bp_ringbuf[]/bp_dmabuf_size[] arrays below. */
#define MIC_ENG_APPLICATION 0
#define MIC_ENG_PAGING 1
#define MIC_ENG_VIDEO 2
#define MIC_ENG_HIGHPRIORITY 3
#define MIC_ENG_MAX_SUPPORTED_ENGINES 4
+
/* Location/size descriptor for one engine's ring buffer, exchanged via the
 * boot parameter blocks below. Layout is shared across the PCIe link —
 * do not reorder or resize fields. */
struct ringbuf_memdesc
{
	uint64_t address;	// Location of the ring buffer
	uint32_t size;		// size of ring buffer
	uint32_t reserved;	// pad: keeps the struct a multiple of 8 bytes
};
+
/*
 * Boot parameter block published by the card-side OS (layout version
 * MIC_BOOT_PARAM_HEADER_VERSION). Shared across the PCIe link: do not
 * reorder or resize fields. Field notes are inferred from the member
 * names — confirm against the card-side definition.
 */
struct mic_bootparam
{
	uint64_t bp_version;		/* layout version of this structure */

	union
	{
		uint32_t bp_bootstatus;	/* MIC_OS_BOOTSTATUS_* progress/error code */
		uint64_t bp_reserved;	/* reserves the full 64 bits of the union */
	};

	uint64_t bp_vcons_addr;		/* virtual console buffer address (presumed) */
	uint64_t bp_vcons_size;		/* virtual console buffer size (presumed) */
	uint64_t bp_shdata_addr;	/* shared data area address (presumed) */
	uint64_t bp_shdata_size;	/* shared data area size (presumed) */
	/* One descriptor per engine; indexed by MIC_ENG_*. */
	struct ringbuf_memdesc bp_ringbuf[MIC_ENG_MAX_SUPPORTED_ENGINES];

	/* bp_unused0..7 retained only to preserve the versioned layout. */
	uint64_t bp_unused0;
	uint64_t bp_unused1;
	uint64_t bp_unused2;
	uint64_t bp_unused3;
	uint64_t bp_unused4;
	uint64_t bp_unused5;
	uint64_t bp_unused6;
	uint64_t bp_unused7;

	uint64_t bp_engstate_addr;	/* address of engine state (see struct enginestate_mic) */

	struct ringbuf_memdesc bp_unused8;

	uint64_t bp_unused9;
	uint64_t bp_unused10;
	uint64_t bp_unused11;

};
+
/*
 * Boot parameter block written by the host driver for the card. Shared
 * across the PCIe link: do not reorder or resize fields. Field notes are
 * inferred from the member names — confirm against the card-side consumer.
 */
struct host_bootparam
{
	uint64_t bp_version;		/* layout version of this structure */

	union
	{
		uint64_t bp_host_type;	/* MIC_HOST_DEFAULT is the only accepted value */
		uint64_t bp_reserved;
	};

	uint64_t bp_vcons_addr;		/* virtual console buffer address (presumed) */
	uint64_t bp_vcons_size;		/* virtual console buffer size (presumed) */

	uint64_t bp_unused0;

	uint64_t bp_engstate_addr;	/* address of engine state (see struct enginestate_mic) */

	/* One descriptor per engine; indexed by MIC_ENG_*. */
	struct ringbuf_memdesc bp_ringbuf[MIC_ENG_MAX_SUPPORTED_ENGINES];

	/* Per-engine DMA buffer sizes (presumed); indexed by MIC_ENG_*. */
	uint64_t bp_dmabuf_size[MIC_ENG_MAX_SUPPORTED_ENGINES];

	uint64_t bp_unused1;
	uint64_t bp_unused2;

	uint64_t bp_aper_size;		/* aperture size (presumed) */

	uint8_t bp_unused3[36];
	uint64_t bp_unused4;

	struct ringbuf_memdesc bp_unused5;

	uint64_t bp_unused6;
	uint64_t bp_unused7;

	uint32_t bp_watchdog_timeout;	/* watchdog timeout — units not visible here, verify */
};
+
/*
 * Per-engine execution state shared with the card. Each counter sits on its
 * own 64-byte boundary — presumably one cache line each to avoid false
 * sharing between the parties updating the individual fields.
 */
struct enginestate_mic
{
	uint32_t writeOffset __attribute__((aligned(64)));
	uint32_t lastCompletedFence __attribute__((aligned(64)));
	uint32_t fenceWhenPreempted __attribute__((aligned(64)));
	uint32_t preemptOffset __attribute__((aligned(64)));
};
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef COMPL_BUF_RING_H
+#define COMPL_BUF_RING_H
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+#include "mic_dma_md.h"
+#ifndef _MIC_SCIF_
+#include "micscif.h"
+#include "micscif_smpt.h"
+#endif
+#define MAX_POLL_TAIL_READ_RETRIES 20
+
/*
 * Completion-buffer ring bookkeeping.
 *
 * Assuming read/write to int is atomic.
 * This can't be used as a generic ring because of update_tail().
 * One entry is left in the ring to differentiate between ring being empty
 * and full.
 */
struct compl_buf_ring {
	int head;		/* next slot to hand out (see allocate_buffer/incr_head) */
	int tail;		/* next slot the DMA engine will complete (see read_tail) */
	int size;		/* total number of slots in the ring */
	uint64_t tail_location;	/* kernel VA (stored as u64) that the engine
				 * writes the last-completed index into;
				 * allocated in init_ring() */
	dma_addr_t tail_phys;	/* device-visible address of tail_location */
};
+
/*
 * FIXME:
 * Function calls pci_map_single etc, return type needs to indicate
 * an error
 */
/*
 * Initialize a completion ring of @size slots for DMA device @device_num.
 *
 * Allocates the 8-byte location the DMA engine writes completed indices to,
 * primes it with -1 ("nothing completed yet"), and makes it device-visible:
 * on the card (_MIC_SCIF_) via virt_to_phys(), on the host via
 * mic_map_single() through the SMPT.
 *
 * NOTE(review): allocation failure is fatal (BUG_ON) and a mapping failure
 * is only logged — see the FIXME above about returning an error instead.
 */
static __always_inline void init_ring(struct compl_buf_ring *ring, int size,
				      int device_num)
{
#ifndef _MIC_SCIF_
	struct pci_dev *pdev;
#endif
	ring->head = 0;
	ring->tail = 0;
	ring->size = size;
	/* GFP_ATOMIC — presumably callable from atomic context; confirm. */
	ring->tail_location = (uint64_t) kmalloc(sizeof(uint64_t), GFP_ATOMIC);
	BUG_ON(!ring->tail_location);
	*(int*)ring->tail_location = -1;	/* sentinel: no entry completed yet */
#ifdef _MIC_SCIF_
	ring->tail_phys = virt_to_phys((void*)ring->tail_location);
#else
	micscif_pci_dev(device_num, &pdev);

	ring->tail_phys = mic_map_single(device_num - 1, pdev, (void *)ring->tail_location,
					 sizeof(uint64_t));
	if (mic_map_error(ring->tail_phys))
		printk(KERN_ERR "mic_map returned error please help\n");
#endif
}
+
/*
 * Tear down a ring set up by init_ring(): zero the bookkeeping, undo the
 * host-side device mapping of tail_location, and free it.
 * NOTE(review): assumes the DMA engine no longer writes to tail_location —
 * confirm callers stop the engine first.
 */
static __always_inline void uninit_ring(struct compl_buf_ring *ring,
					int device_num)
{
#ifndef _MIC_SCIF_
	struct pci_dev *pdev;
#endif
	ring->head = 0;
	ring->tail = 0;
	ring->size = 0;
#ifndef _MIC_SCIF_
	micscif_pci_dev(device_num, &pdev);
	mic_unmap_single(device_num - 1, pdev, ring->tail_phys, sizeof(uint64_t));
#endif
	kfree((void *)ring->tail_location);
}
+
/* Advance a ring-buffer index by one slot, wrapping back to 0 at ring_size. */
static __always_inline int incr_rb_index(int cur_index, int ring_size)
{
	int next_index = (cur_index + 1) % ring_size;
	return next_index;
}
+
/*
 * Tail location has the index that has been recently processed by dma engine
 * But, tail has to point to the index that will be processed next.
 * The +1 adjustment is done in read_tail(); update_tail() only stores the
 * already-adjusted value into the ring.
 */
static __always_inline void update_tail(struct compl_buf_ring *ring, int new_tail)
{
	ring->tail = new_tail;
}
+
+static __always_inline int read_tail(struct compl_buf_ring *ring)
+{
+ return incr_rb_index(*(volatile int*)ring->tail_location, ring->size);
+}
+
+/*
+ * This fn. assumes no one else is updating head
+ * Returns - avaliable space
+ * 0 - if no space is available
+ */
+static __always_inline bool avail_space_in_ring(struct compl_buf_ring *ring)
+{
+ int count = 0, max_num_retries = MAX_POLL_TAIL_READ_RETRIES, num_retries = 0;
+ int head = ring->head, tail = ring->tail;
+retry:
+ if (head > tail)
+ count = (tail - 0) + (ring->size - head);
+ else if (tail > head)
+ count = tail - head;
+ else
+ return ring->size - 1;
+
+ if (1 != count)
+ return count - 1;
+
+ num_retries++;
+ if (num_retries == max_num_retries)
+ return 0;
+ cpu_relax();
+
+ ring->tail = read_tail(ring);
+ tail = ring->tail;
+
+ goto retry;
+}
+
+/*
+ * Used for polling
+ */
+static __always_inline bool is_entry_processed(struct compl_buf_ring *ring, int index)
+{
+ int head = ring->head, tail = ring->tail;
+ if (head < tail) {
+ if (index >= head && index < tail)
+ return 1;
+ } else {
+ if (index >= head || index < tail)
+ return 1;
+ }
+ return 0;
+}
+
+static __always_inline void incr_head(struct compl_buf_ring *ring)
+{
+ ring->head = incr_rb_index(ring->head, ring->size);
+}
+
+/*
+ * This function is not reentrant
+ * It is expected that the user of this func, will call incr_head() if allocated
+ * buffer is used
+ */
+static __always_inline int allocate_buffer(struct compl_buf_ring *ring)
+{
+ if (avail_space_in_ring(ring))
+ return ring->head;
+ else
+ return -1;
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* Contains common definitions for Windows and Linux IO Interface */
+
+#ifndef __IO_INTERFACE_H__
+#define __IO_INTERFACE_H__
+
+/*
+ * The host driver exports sysfs entries in
+ * /sys/class/mic/micX/
+ * The "/sys/class/mic/micX/state" entry reflects the state of the
+ * card as it transitions from hardware reset through booting an image
+ *
+ * All the other entries have valid values when the state entry is either
+ * "ready" or "online"
+ */
+
+/*
+ * -----------------------------------------
+ * IOCTL interface information
+ * -----------------------------------------
+ */
+
+#define IOCTL_FLASHCMD _IOWR('c', 5, struct ctrlioctl_flashcmd *)
+#define IOCTL_CARDMEMCPY _IOWR('c', 8, struct ctrlioctl_cardmemcpy *)
+
/* KNC (Knights Corner) silicon stepping identifiers. Enumerator order
 * fixes the numeric values — do not re-sort (B1 appears after C). */
typedef enum _product_knc_stepping_t
{
	KNC_A_STEP,
	KNC_B0_STEP,
	KNC_C_STEP,
	KNC_B1_STEP
} product_knc_stepping_t;
+
/* Command codes carried in ctrlioctl_flashcmd.type (IOCTL_FLASHCMD).
 * The explicit values 0xD-0xF are fixed; keep them stable (they appear to
 * be part of the RASMM command protocol — verify). */
typedef enum {
	FLASH_CMD_ABORT,
	FLASH_CMD_READ,
	FLASH_CMD_WRITE,
	FLASH_CMD_VERSION,
	RAS_CMD,
	RAS_CMD_INJECT_REPAIR,
	RAS_CMD_CORE_DISABLE,
	RAS_CMD_CORE_ENABLE,
	RAS_CMD_ECC_DISABLE = 0xD,
	RAS_CMD_ECC_ENABLE = 0xE,
	RAS_CMD_EXIT = 0xF,
	/* Driver only commands that are not passed to RASMM */
	FLASH_CMD_READ_DATA,
	FLASH_CMD_STATUS,
} MIC_FLASH_CMD_TYPE;
+
/**
 * struct ctrlioctl_flashcmd:
 *
 * \param brdnum board for which the IOCTL is requested
 * \param type flash/RAS command opcode (MIC_FLASH_CMD_TYPE)
 * \param data pointer to the command's input/output buffer
 * \param len size in bytes of the buffer at \a data
 *
 * This structure is used for IOCTL_FLASHCMD.
 *
 * This IOCTL can only be issued when /sys/class/mic/mic0/state returns "online"
 * after it has been set to "boot:flash"
 *
 * NOTE(review): the original \param texts were shuffled across the wrong
 * fields; corrected above to match the member names — verify against the
 * IOCTL handler.
 */
struct ctrlioctl_flashcmd {
	uint32_t brdnum;
	MIC_FLASH_CMD_TYPE type;
	void *data;
	uint32_t len;
};
+
+
/*
 * IN/OUT structure used by MIC_FLASH_CMD_TYPE FLASH_CMD_VERSION.
 * This structure is passed as the data buffer of that command.
 */
#define MAX_FLASH_VER_STRLEN 16
struct version_struct {
	uint16_t hdr_ver;	/* layout version of this structure */
	uint16_t odm_ver;	/* revision for ODM change for flash */
	uint64_t upd_time_bcd;	/* update timestamp, BCD-encoded (presumed) */
	uint8_t upd_ver[MAX_FLASH_VER_STRLEN];	/* 16 bytes for flash version (update) */
	uint64_t mfg_time_bcd;	/* manufacturing timestamp, BCD-encoded (presumed) */
	uint8_t mfg_ver[MAX_FLASH_VER_STRLEN];	/* 16 bytes for flash version (manufacturing) */
};
+
/*
 * Status values returned by MIC_FLASH_CMD_TYPE FLASH_CMD_STATUS.
 * FLASH_CMD_INVALID is pinned at 0xF — it appears to match the 4-bit
 * status field of sbox_scratch1_reg_t below; verify before changing.
 */
typedef enum {
	FLASH_IDLE,
	FLASH_CMD_IN_PROGRESS,
	FLASH_CMD_COMPLETED,
	FLASH_CMD_FAILED,
	FLASH_CMD_AUTH_FAILED,
	FLASH_SMC_CMD_IN_PROGRESS,
	FLASH_SMC_CMD_COMPLETE,
	FLASH_SMC_CMD_FAILED,
	FLASH_SMC_CMD_AUTH_FAILED,
	FLASH_CMD_INVALID = 0xF,
} MIC_FLASH_STATUS;
+
/* Response payload for FLASH_CMD_STATUS. The members appear to mirror the
 * bit fields of sbox_scratch1_reg_t (status/percent/smc_status/cmd_data/
 * mm_debug) unpacked into full words — confirm against the handler. */
struct flash_stat {
	MIC_FLASH_STATUS status;
	uint32_t percent;
	uint32_t smc_status;
	uint32_t cmd_data;
	uint32_t mm_debug;
};
+
/* Selects which MMIO register block an operation targets. */
typedef enum {
	DBOX,
	SBOX,
} MMIO_REGISTER_TYPE;
+
/**
 * struct ctrlioctl_cardmemcpy:
 *
 * \param brdnum board for which the IOCTL is requested
 * \param start card side physical address from which the copy will start
 * \param size number of bytes to copy
 * \param dest user buffer in which data is to be copied
 *
 * This structure is used for IOCTL_CARDMEMCPY.
 * (The original comment said IOCTL_MMIOREAD, which does not match the
 * define above.)
 */
struct ctrlioctl_cardmemcpy {
	uint32_t brdnum;
	uint64_t start;
	uint64_t size;
	void *dest;
};
+
+/*
+ * FIXME:: All the typedefines and structures below and their references need
+ * to be cleaned up from the driver code
+ *---------------------------------------------------------------------------
+ */
+
/* Coprocessor product family: ABR (Knights Ferry) or KNC (Knights Corner). */
typedef enum _product_family_t
{
	FAMILY_UNKNOWN = 0,
	FAMILY_ABR,
	FAMILY_KNC
} product_family_t;
+
/* Card usage modes; USAGE_MODE_MAX is a count/sentinel, not a real mode. */
typedef enum {
	USAGE_MODE_NORMAL = 0,
	USAGE_MODE_MAINTENANCE,
	USAGE_MODE_ZOMBIE,
	USAGE_MODE_MEMDIAG,
	USAGE_MODE_NORMAL_RESTRICTED,
	USAGE_MODE_NOP,
	USAGE_MODE_MAX,

} CARD_USAGE_MODE;
+
/*
 * SBOX register definitions
 * TODO: Remove the bit fields and replace them with bitwise operators
 *
 * Scratch register 1: flash/RAS command status word. Field names line up
 * with struct flash_stat above (bit-field layout is compiler-dependent —
 * another reason for the TODO).
 */
typedef union sbox_scratch1_reg {
	uint32_t value;
	struct {
		uint32_t percent : 7;
		uint32_t status : 4;
		uint32_t command : 4;
		uint32_t smc_status : 4;
		uint32_t reserved : 5;
		uint32_t cmd_data : 7;
		uint32_t mm_debug : 1;
	} bits;
} sbox_scratch1_reg_t;
+
/* Scratch register 2: bootstrap handshake word (readiness flag, BSP APIC id
 * and a 20-bit image address field — units/shift not visible here, verify). */
typedef union sbox_scratch2_reg {
	uint32_t value;
	struct {
		uint32_t bootstrap_ready : 1;
		uint32_t bsp_apic_id : 9;
		uint32_t reserved : 2;
		uint32_t image_addr : 20;
	} bits;
} sbox_scratch2_reg_t;
+
+#endif //!__IO_INTERFACE_H__
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MIC_DMA_API_H
+#define MIC_DMA_API_H
+
+struct dma_channel;
+/* API exported by the DMA library */
+
+/*
+ * Per MIC device (per MIC card) DMA handle. The card opens the handle to its own device.
+ * The host opens the handle to the DMA devices of one of the cards.
+ */
+typedef void * mic_dma_handle_t;
+
+/* DMA Library Init/Uninit Routines */
+int open_dma_device(int device_num, uint8_t *mmio_va_base, mic_dma_handle_t* dma_handle);
+
+void close_dma_device(int device_num, mic_dma_handle_t *dma_handle);
+
+/*
+ * reserve_dma_channel - reserve a given dma channel for exclusive use
+ *
+ * @dma_handle - handle to DMA device returned by open_dma_device
+ * @chan_num - Channel number to be reserved
+ * @chan - set to point to the dma channel reserved by the call
+ *
 * Returns a negative errno on error
+ * Returns 0 on success
+ */
+int reserve_dma_channel(mic_dma_handle_t dma_handle, int chan_num, struct dma_channel **chan);
+
+/*
+ * allocate_dma_channel - dynamically allocate a dma channel (for a short while). Will
+ * search for, choose, and lock down one channel for use by the calling thread.
+ *
+ * @dma_handle - handle to DMA device returned by open_dma_device
+ * @chan - Returns the dma_channel pointer that was allocated by the call
+ *
 * Returns a negative errno on error
 * Returns 0 on success
 *
 * NOTE: This function grabs a lock before exiting -- the calling thread MUST NOT
 * sleep, and must call free_dma_channel before returning to user-space or switching
 * voluntarily to another thread. Similarly, this function cannot be called from
 * an interrupt context at this time.
+ */
+int allocate_dma_channel(mic_dma_handle_t dma_handle, struct dma_channel **chan);
+
/*
 * request_dma_channel - Request a specific DMA channel.
 *
 * @chan - the DMA channel to request
 *
 * Returns: 0 on success and -ERESTARTSYS if the wait was interrupted
 * or -EBUSY if the channel was not available.
 *
 * NOTE: This function grabs a lock before exiting -- the calling thread MUST NOT
 * sleep, and must call free_dma_channel before returning to user-space or switching
 * voluntarily to another thread. Similarly, this function cannot be called from
 * an interrupt context at this time.
 */
+int request_dma_channel(struct dma_channel *chan);
+
+/*
+ * free_dma_channel - after allocating a channel, used to
+ * free the channel after DMAs are submitted
+ *
+ * @chan - pointer to the dma_channel struct that was allocated
+ *
 * Returns 0 on success, a negative errno on error
+ *
+ * NOTE: This function must be called after all do_dma calls are finished,
+ * but can be called before the DMAs actually complete (as long as the comp_cb()
+ * handler in do_dma don't refer to the dma_channel struct). If called with a
+ * dynamically allocated dma_channel, the caller must be the thread that called
+ * allocate_dma_channel. When operating on a dynamic channel, free unlocks the
+ * mutex locked in allocate. Statically allocated channels cannot be freed,
+ * and calling this function with that type of channel will return an error.
+ */
+int free_dma_channel(struct dma_channel *chan);
+
+/*
+ * drain_dma_poll - Drain all outstanding DMA operations for a particular
+ * DMA channel via polling.
+ * @chan - DMA channel
+ * Return 0 on success and -errno on error.
+ */
+int drain_dma_poll(struct dma_channel *chan);
+
+/*
+ * drain_dma_intr - Drain all outstanding DMA operations for a particular
+ * DMA channel via interrupt based blocking wait.
+ * @chan - DMA channel
+ * Return 0 on success and -errno on error.
+ */
+int drain_dma_intr(struct dma_channel *chan);
+
/*
 * drain_dma_global - Drain all outstanding DMA operations for
 * all online DMA channels.
 * @dma_handle - handle to the DMA device returned by open_dma_device
 * Return 0 on success and -errno on error.
 */
+int drain_dma_global(mic_dma_handle_t dma_handle);
+
+#ifdef _MIC_SCIF_
+/*
+ * dma_suspend: DMA tasks before transition to low power state.
+ * @dma_handle: Handle for a DMA driver context.
+ */
+void dma_suspend(mic_dma_handle_t dma_handle);
+
+/*
+ * dma_resume: DMA tasks after wake up from low power state.
+ * @dma_handle: Handle for a DMA driver context.
+ */
+void dma_resume(mic_dma_handle_t dma_handle);
+#else
+/*
+ * dma_prep_suspend: DMA tasks required on host before a device can transition
+ * to a low power state.
+ * @dma_handle: Handle for a DMA driver context.
+ */
+void dma_prep_suspend(mic_dma_handle_t dma_handle);
+#endif
+
/* Convenience wrapper for DMA worker threads: releases @chan, discarding
 * the status returned by free_dma_channel(). */
static inline void mic_dma_thread_free_chan(struct dma_channel *chan)
{
	free_dma_channel(chan);
}
+#ifndef _MIC_SCIF_
+//extern struct mutex lock_dma_dev_init;
+void host_dma_interrupt_handler(mic_dma_handle_t dma_handle, uint32_t sboxSicr0Reg);
+#endif
+
+#endif /* MIC_DMA_API_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MIC_DMA_LIB_H
+#define MIC_DMA_LIB_H
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
+/* Program SUD for poll ring */
+#define DO_DMA_POLLING (1<<0)
+/* Program SUD for interrupt ring */
+#define DO_DMA_INTR (1<<1)
+
+struct dma_channel;
+
+/*
+ * struct dma_completion_cb - Completion state handed to do_dma().
+ * @dma_completion_func: invoked with @cb_cookie when the transfer completes;
+ *                       runs in interrupt context (see the do_dma() contract
+ *                       below), so it must not sleep or block.
+ * @cb_cookie: opaque value passed back to @dma_completion_func.
+ *
+ * The remaining fields appear to support bounce-buffered transfers into a
+ * remote registration window; their exact semantics are not visible in this
+ * header -- confirm against the implementation before relying on the notes.
+ */
+struct dma_completion_cb {
+ void (*dma_completion_func) (uint64_t cookie);
+ uint64_t cb_cookie;
+ uint8_t *temp_buf;         /* presumably the working bounce-buffer pointer */
+ uint8_t *temp_buf_to_free; /* presumably the original allocation to free */
+ bool is_cache;
+ uint64_t dst_offset;
+ uint64_t tmp_offset;
+ struct reg_range_t *dst_window;
+ size_t len;
+ dma_addr_t temp_phys;      /* DMA/bus address of the temp buffer -- TODO confirm */
+ int remote_node;
+ int header_padding;
+};
+
+int get_chan_num(struct dma_channel *chan);
+/*
+ * do_dma - main dma function: perform a dma memcpy, len bytes from src to dst
+ *
+ * @chan - DMA channel to use for the transfer. The channel can be allocated
+ * dynamically by calling allocate_dma_channel, or statically by
+ * reserve_dma_channel. Using a channel not allocated in this way will
+ * result in undefined behavior.
+ * @flags - ATOMIC, called from an interrupt context (no blocking)
+ * @src - src physical address
+ * @dst - dst physical address
+ * @len - Length of the dma
+ * @comp_cb - When the DMA is complete, the struct's function will be called. NOTE!
+ * comp_cb(cb_cookie) is called from an interrupt context, so the
+ * function must not sleep or block.
+ *
+ * Return < 0 on error
+ * Return 0 on success and DMA is completed
+ * Return > 0: DMA has been queued. Return value can be polled on for completion
+ * (poll cookie). An example (simplified w/ no error handling).
+ * int cookie = do_dma(...);
+ * while (poll_dma_completion(cookie) == 0);
+ * printf("DMA now complete\n");
+ */
+int do_dma(struct dma_channel *chan, int flags,
+ uint64_t src, uint64_t dst, size_t len,
+ struct dma_completion_cb *comp_cb);
+/*
+ * poll_dma_completion - check if a DMA is complete
+ *
+ * @poll_cookie - value returned from do_dma
+ *
+ * Returns
+ * < 0 -> error (e.g., invalid cookie)
+ * 0 -> DMA pending
+ * 1 -> DMA completed
+ *
+ * Note: This is mostly useful after calling do_dma with a NULL comp_cb parameter, as
+ * it will allow the caller to wait for DMA completion.
+ */
+int poll_dma_completion(int poll_cookie, struct dma_channel *chan);
+
+/*
+ * do_status_update: Update physical address location with the value provided.
+ * Ensures all previous DMA descriptors submitted on this DMA
+ * channel are executed.
+ * @chan - DMA channel to use for the transfer. The channel can be allocated
+ * dynamically by calling allocate_dma_channel, or statically by
+ * reserve_dma_channel. Using a channel not allocated in this way will
+ * result in undefined behavior.
+ * @phys - physical address
+ * @value - Value to be programmed
+ *
+ * Return 0 on success and appropriate error value on error.
+ */
+int do_status_update(struct dma_channel *chan, uint64_t phys, uint64_t value);
+
+/*
+ * get_dma_mark: Obtain current value of DMA mark
+ * @chan - DMA channel to use for the transfer. The channel can be allocated
+ * dynamically by calling allocate_dma_channel, or statically by
+ * reserve_dma_channel. Using a channel not allocated in this way will
+ * result in undefined behavior.
+ *
+ * Return mark.
+ */
+int get_dma_mark(struct dma_channel *chan);
+
+/*
+ * is_current_dma_mark: Check if the dma mark provided is the current DMA mark.
+ * @chan - DMA channel
+ * @mark - DMA mark
+ *
+ * Return true on success and false on failure.
+ */
+bool is_current_dma_mark(struct dma_channel *chan, int mark);
+
+/*
+ * program_dma_mark: Increment the current value of the DMA mark for a DMA channel
+ * and program an interrupt status update descriptor which ensures that all DMA
+ * descriptors programmed until this point in time are completed.
+ * @chan - DMA channel to use for the transfer. The channel can be allocated
+ * dynamically by calling allocate_dma_channel, or statically by
+ * reserve_dma_channel. Using a channel not allocated in this way will
+ * result in undefined behavior.
+ *
+ * Return mark upon success and appropriate negative error value on error.
+ */
+int program_dma_mark(struct dma_channel *chan);
+
+/*
+ * is_dma_mark_processed: Check if the dma mark provided has been processed.
+ * @chan - DMA channel
+ * @mark - DMA mark
+ *
+ * Return true on success and false on failure.
+ */
+bool is_dma_mark_processed(struct dma_channel *chan, int mark);
+
+/*
+ * dma_mark_wait: Wait for the dma mark to complete.
+ * @chan - DMA channel
+ * @mark - DMA mark
+ * @is_interruptible - Use wait_event_interruptible() or not.
+ *
+ * Return 0 on success and appropriate error value on error.
+ */
+int dma_mark_wait(struct dma_channel *chan, int mark, bool is_interruptible);
+
+#ifndef _MIC_SCIF_
+void host_dma_lib_interrupt_handler(struct dma_channel *chan);
+#endif
+
+#endif /* MIC_DMA_LIB_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MIC_DMA_MD_H
+#define MIC_DMA_MD_H
+
+#include "mic_sbox_md.h"
+#include "micsboxdefine.h"
+
+#define MAX_NUM_DMA_CHAN 8
+/*
+ * WE ASSUME 0 to __LAST_HOST_CHAN_NUM are owned by host
+ * Keep this in mind when changing this value
+ */
+#define __LAST_HOST_CHAN_NUM 3
+
+#ifdef _MIC_SCIF_
+/*
+ * Card (_MIC_SCIF_) side: channels 0..__LAST_HOST_CHAN_NUM are owned by the
+ * host (see the comment on __LAST_HOST_CHAN_NUM above), so the card uses the
+ * remaining channels up to MAX_NUM_DMA_CHAN - 1.
+ */
+static inline int first_dma_chan(void)
+{
+ return __LAST_HOST_CHAN_NUM + 1;
+}
+
+static inline int last_dma_chan(void)
+{
+ return MAX_NUM_DMA_CHAN - 1;
+}
+#else
+/*
+ * Host side: the host owns channels 0..__LAST_HOST_CHAN_NUM.
+ */
+static inline int first_dma_chan(void)
+{
+ return 0;
+}
+
+static inline int last_dma_chan(void)
+{
+ return __LAST_HOST_CHAN_NUM;
+}
+#endif
+/*
+ * md_mic_dma_chan_reg - Identifiers for the per-channel DMA registers.
+ * Each enumerator is used as the column index into the mic_dma_reg[][]
+ * offset table below (see md_mic_dma_read_mmio()/md_mic_dma_write_mmio()),
+ * so the order here must match that table's column order exactly.
+ */
+enum md_mic_dma_chan_reg {
+ REG_DCAR = 0,
+ REG_DHPR,
+ REG_DTPR,
+ REG_DAUX_HI,
+ REG_DAUX_LO,
+ REG_DRAR_HI,
+ REG_DRAR_LO,
+ REG_DITR,
+ REG_DSTAT,
+ REG_DSTATWB_LO,
+ REG_DSTATWB_HI,
+ REG_DCHERR,
+ REG_DCHERRMSK,
+};
+
+
+/* Pre-defined L1_CACHE_SHIFT is 6 on RH and 7 on Suse */
+#undef L1_CACHE_SHIFT
+#define L1_CACHE_SHIFT 6
+#undef L1_CACHE_BYTES
+#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+
+enum dma_chan_flags {
+ CHAN_AVAILABLE = 2,
+ CHAN_INUSE = 3
+};
+
+/* Maximum DMA transfer size for a single memory copy descriptor */
+#define MIC_MAX_DMA_XFER_SIZE (((1U) * 1024 * 1024) - L1_CACHE_BYTES)
+
+/* TODO:
+ * I think it should be 128K - 64 (even 128k - 4 may work).
+ * SIVA: Check this in the end
+ */
+/*
+ * The maximum number of descriptors in the DMA descriptor queue is
+ * 128K - 1 but since it needs to be a multiple of cache lines it is 128K - 64
+ */
+#define MIC_MAX_NUM_DESC_PER_RING ((128 * 1024) - L1_CACHE_BYTES)
+
+/**
+ * enum md_mic_dma_chan_owner - Memory copy DMA channels can be Host or MIC owned.
+ * AES channel can only be MIC owned.
+ */
+enum md_mic_dma_chan_owner {
+ MIC_DMA_CHAN_MIC_OWNED = 0,
+ MIC_DMA_CHAN_HOST_OWNED
+};
+
+/**
+ * enum md_mic_dma_aes_endianness - Endianness needs to be provided
+ * only for the AES channel
+ */
+enum md_mic_dma_aes_endianness {
+ /*
+ * The following two bits are opposite of what is given in
+ * content protection HAS but this is how it is implemented in RTL.
+ */
+ MIC_BIG_ENDIAN = 0,
+ MIC_LITTLE_ENDIAN
+};
+
+
+/**
+ * struct md_mic_dma_chan - DMA channel specific state.
+ * @ch_num: channel number, used to index the per-channel register table
+ * @in_use: non-zero while the channel is allocated
+ * @owner: host or MIC; required for masking/unmasking interrupts and
+ *         enabling channels
+ * @endianness: required for enabling the AES channel
+ * @cookie: debug cookie identifying this structure (checked by CHECK_CHAN)
+ * @num_desc_in_ring: number of descriptors in this channel's descriptor ring
+ * @cached_tail: last tail value read from hardware (refreshed by
+ *               md_avail_desc_ring_space())
+ * @completion_count: completion counter -- semantics not visible in this header
+ * @dstat_wb_loc/@dstat_wb_phys: DSTAT write-back location (virtual/physical)
+ */
+struct md_mic_dma_chan {
+ int ch_num;
+ atomic_t in_use;
+ enum md_mic_dma_chan_owner owner;
+ enum md_mic_dma_aes_endianness endianness;
+ int cookie;
+ uint32_t num_desc_in_ring;
+ uint32_t cached_tail;
+ uint32_t completion_count;
+ void *dstat_wb_loc;
+ dma_addr_t dstat_wb_phys;
+ /* Add debug/profiling stats here */
+};
+
+
+/*
+ * struct mic_dma_device - MIC DMA Device specific structure
+ * @chan_info - static array of MIC DMA channel specific structures
+ * @mm_sbox - mapped SBOX MMIO base used for all channel register accesses
+ */
+struct mic_dma_device {
+ struct md_mic_dma_chan chan_info[MAX_NUM_DMA_CHAN];
+ void *mm_sbox;
+};
+
+
+/**
+ * union md_mic_dma_desc - 16-byte DMA descriptor, viewed either through the
+ * typed bitfield layouts (nop/memcopy/status/general/keynoncecnt/key, each
+ * selected by the top 4-bit type field) or as two raw quadwords for fast
+ * zero-initialization.
+ */
+/* TODO: Change bitfields to portable masks */
+union md_mic_dma_desc {
+ union {
+ struct {
+ uint64_t rsvd0;
+ uint64_t rsvd1:60;
+ uint64_t type:4;
+ } nop;
+ struct {
+ uint64_t sap:40;
+ uint64_t index:3;
+ uint64_t rsvd0:3;
+ uint64_t length:14;
+ uint64_t rsvd1:4;
+ uint64_t dap:40;
+ uint64_t resd:15;
+ uint64_t twb:1;
+ uint64_t intr:1;
+ uint64_t c:1;
+ uint64_t co:1;
+ uint64_t ecy:1;
+ uint64_t type:4;
+ } memcopy;
+ struct {
+ uint64_t data;
+ uint64_t dap:40;
+ uint64_t rsvdr0:19;
+ uint64_t intr:1;
+ uint64_t type:4;
+ } status;
+ struct {
+ uint64_t data:32;
+ uint64_t rsvd0:32;
+ uint64_t dap:40;
+ uint64_t rsvd1:20;
+ uint64_t type:4;
+ } general;
+ struct {
+ uint64_t data;
+ uint64_t rsvd0:53;
+ uint64_t cs:1;
+ uint64_t index:3;
+ uint64_t h:1;
+ uint64_t sel:2;
+ uint64_t type:4;
+ } keynoncecnt;
+ struct {
+ uint64_t skap:40;
+ uint64_t ski:3;
+ uint64_t rsvd0:21;
+ uint64_t rsvd1:51;
+ uint64_t di:3;
+ uint64_t rsvd2:6;
+ uint64_t type:4;
+ } key;
+ } desc;
+ struct {
+ uint64_t qw0;
+ uint64_t qw1;
+ } qwords;
+};
+
+/* Initialization functions */
+void md_mic_dma_init(struct mic_dma_device *dma_dev, uint8_t *mmio_va_base);
+void md_mic_dma_uninit(struct mic_dma_device *dma_dev);
+void md_mic_dma_chan_init_attr(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan);
+void md_mic_dma_chan_mask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan);
+void md_mic_dma_chan_unmask_intr(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan);
+void md_mic_dma_chan_set_desc_ring(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan,
+ phys_addr_t desc_ring_phys_addr,
+ uint32_t num_desc);
+void md_mic_dma_enable_chan(struct mic_dma_device *dma_dev, uint32_t chan_num, bool enable);
+/* API */
+struct md_mic_dma_chan *md_mic_dma_request_chan(struct mic_dma_device *dma_dev,
+ enum md_mic_dma_chan_owner owner);
+void md_mic_dma_free_chan(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan);
+
+/*
+ * Per-channel SBOX register offsets, indexed as
+ * mic_dma_reg[channel][md_mic_dma_chan_reg]; the column order must match
+ * enum md_mic_dma_chan_reg.  Declared const: the table is only ever read
+ * (by md_mic_dma_read_mmio()/md_mic_dma_write_mmio()), and const lets each
+ * including translation unit keep its copy in read-only data.
+ */
+static const uint32_t mic_dma_reg[MAX_NUM_DMA_CHAN][13] = {
+ {SBOX_DCAR_0, SBOX_DHPR_0, SBOX_DTPR_0, SBOX_DAUX_HI_0, SBOX_DAUX_LO_0, SBOX_DRAR_HI_0,
+ SBOX_DRAR_LO_0, SBOX_DITR_0, SBOX_DSTAT_0,
+ SBOX_DSTATWB_LO_0, SBOX_DSTATWB_HI_0, SBOX_DCHERR_0, SBOX_DCHERRMSK_0},
+ {SBOX_DCAR_1, SBOX_DHPR_1, SBOX_DTPR_1, SBOX_DAUX_HI_1, SBOX_DAUX_LO_1, SBOX_DRAR_HI_1,
+ SBOX_DRAR_LO_1, SBOX_DITR_1, SBOX_DSTAT_1,
+ SBOX_DSTATWB_LO_1, SBOX_DSTATWB_HI_1, SBOX_DCHERR_1, SBOX_DCHERRMSK_1},
+ {SBOX_DCAR_2, SBOX_DHPR_2, SBOX_DTPR_2, SBOX_DAUX_HI_2, SBOX_DAUX_LO_2, SBOX_DRAR_HI_2,
+ SBOX_DRAR_LO_2, SBOX_DITR_2, SBOX_DSTAT_2,
+ SBOX_DSTATWB_LO_2, SBOX_DSTATWB_HI_2, SBOX_DCHERR_2, SBOX_DCHERRMSK_2},
+ {SBOX_DCAR_3, SBOX_DHPR_3, SBOX_DTPR_3, SBOX_DAUX_HI_3, SBOX_DAUX_LO_3, SBOX_DRAR_HI_3,
+ SBOX_DRAR_LO_3, SBOX_DITR_3, SBOX_DSTAT_3,
+ SBOX_DSTATWB_LO_3, SBOX_DSTATWB_HI_3, SBOX_DCHERR_3, SBOX_DCHERRMSK_3},
+ {SBOX_DCAR_4, SBOX_DHPR_4, SBOX_DTPR_4, SBOX_DAUX_HI_4, SBOX_DAUX_LO_4, SBOX_DRAR_HI_4,
+ SBOX_DRAR_LO_4, SBOX_DITR_4, SBOX_DSTAT_4,
+ SBOX_DSTATWB_LO_4, SBOX_DSTATWB_HI_4, SBOX_DCHERR_4, SBOX_DCHERRMSK_4},
+ {SBOX_DCAR_5, SBOX_DHPR_5, SBOX_DTPR_5, SBOX_DAUX_HI_5, SBOX_DAUX_LO_5, SBOX_DRAR_HI_5,
+ SBOX_DRAR_LO_5, SBOX_DITR_5, SBOX_DSTAT_5,
+ SBOX_DSTATWB_LO_5, SBOX_DSTATWB_HI_5, SBOX_DCHERR_5, SBOX_DCHERRMSK_5},
+ {SBOX_DCAR_6, SBOX_DHPR_6, SBOX_DTPR_6, SBOX_DAUX_HI_6, SBOX_DAUX_LO_6, SBOX_DRAR_HI_6,
+ SBOX_DRAR_LO_6, SBOX_DITR_6, SBOX_DSTAT_6,
+ SBOX_DSTATWB_LO_6, SBOX_DSTATWB_HI_6, SBOX_DCHERR_6, SBOX_DCHERRMSK_6},
+ {SBOX_DCAR_7, SBOX_DHPR_7, SBOX_DTPR_7, SBOX_DAUX_HI_7, SBOX_DAUX_LO_7, SBOX_DRAR_HI_7,
+ SBOX_DRAR_LO_7, SBOX_DITR_7, SBOX_DSTAT_7,
+ SBOX_DSTATWB_LO_7, SBOX_DSTATWB_HI_7, SBOX_DCHERR_7, SBOX_DCHERRMSK_7}
+};
+
+/* Read per-channel DMA register @reg of channel @chan through SBOX MMIO. */
+static __always_inline uint32_t
+md_mic_dma_read_mmio(struct mic_dma_device *dma_dev,
+   int chan, enum md_mic_dma_chan_reg reg)
+{
+ return mic_sbox_read_mmio(dma_dev->mm_sbox, mic_dma_reg[chan][reg]);
+}
+
+/* Write @value to per-channel DMA register @reg of channel @chan via SBOX MMIO. */
+static __always_inline void
+md_mic_dma_write_mmio(struct mic_dma_device *dma_dev, int chan,
+   enum md_mic_dma_chan_reg reg, uint32_t value)
+{
+ mic_sbox_write_mmio(dma_dev->mm_sbox, mic_dma_reg[chan][reg], value);
+}
+
+#ifdef DEBUG
+#ifndef KASSERT
+/*
+ * KASSERT(x, y, ...) - if condition x is false, printk message y then BUG.
+ * 'x' is parenthesized in the expansion so that expressions such as
+ * KASSERT(a == b, ...) negate correctly; the original expanded to !x,
+ * which parses as (!a) == b.  Note 'x' is still evaluated twice, so it
+ * must be free of side effects.
+ */
+#define KASSERT(x, y, ...) \
+ do { \
+  if (!(x)) \
+   printk(y, ##__VA_ARGS__);\
+  BUG_ON(!(x)); \
+ } while(0)
+#endif
+/* Sanity-check a DMA channel: non-NULL, valid debug cookie, marked in use. */
+#define CHECK_CHAN(chan) \
+ do { \
+  KASSERT((chan), "NULL DMA channel\n"); \
+  KASSERT((DMA_CHAN_COOKIE == chan->cookie), \
+   "Bad DMA channel cookie 0x%x\n", chan->cookie); \
+  KASSERT(atomic_read(&(chan->in_use)), "DMA Channel not in use\n"); \
+ } while(0)
+#else // DEBUG
+#ifndef KASSERT
+#define KASSERT(x, y, ...) \
+ do { \
+  if (!(x)) \
+   printk(y, ##__VA_ARGS__);\
+  BUG_ON(!(x)); \
+ } while(0)
+#endif
+/* Channel sanity checks are compiled out in non-DEBUG builds. */
+#define CHECK_CHAN(chan)
+
+#endif // DEBUG
+
+struct mic_dma_ctx_t;
+void md_mic_dma_chan_set_dstat_wb(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan);
+
+void md_mic_dma_chan_set_dcherr_msk(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan, uint32_t mask);
+
+/*
+ * md_mic_dma_chan_write_head - Program a new head pointer (DHPR) for @chan.
+ * @head must be a valid ring index (< num_desc_in_ring); the channel is
+ * sanity-checked via CHECK_CHAN/KASSERT in DEBUG builds.
+ */
+static __always_inline void
+md_mic_dma_chan_write_head(struct mic_dma_device *dma_dev,
+   struct md_mic_dma_chan *chan, uint32_t head)
+{
+ uint32_t chan_num;
+ CHECK_CHAN(chan);
+ chan_num = chan->ch_num;
+ KASSERT((head < chan->num_desc_in_ring),
+  "head 0x%x > num_desc_in_ring 0x%x chan_num %d\n",
+  head, chan->num_desc_in_ring, chan_num);
+ md_mic_dma_write_mmio(dma_dev, chan_num, REG_DHPR, head);
+}
+
+uint32_t md_mic_dma_chan_read_head(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan);
+uint32_t md_mic_dma_chan_read_tail(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan);
+
+/* Max times to re-read the tail from hardware before giving up (return 0). */
+#define TAIL_PTR_READ_RETRIES 500000
+/* The DSTAT register's hardware completion count lives in the low 17 bits. */
+#define HW_CMP_CNT_MASK 0x1ffff
+/*
+ * md_avail_desc_ring_space - Number of descriptor slots free between @head
+ * and the (possibly stale) tail of @chan's ring.
+ *
+ * Starts from chan->cached_tail.  If fewer than @required slots are free,
+ * the tail is refreshed from hardware -- from the tail pointer register on
+ * A-step parts (@is_astep), otherwise from the DSTAT completion count --
+ * and the computation retries, up to TAIL_PTR_READ_RETRIES times with
+ * cpu_relax() between attempts.
+ *
+ * Returns the free-slot count minus one (one slot appears to be held back,
+ * presumably so a full ring stays distinguishable from an empty one --
+ * TODO confirm), or 0 if @required slots never became available.
+ */
+static __always_inline uint32_t
+md_avail_desc_ring_space(struct mic_dma_device *dma_dev, bool is_astep,
+  struct md_mic_dma_chan *chan, uint32_t head, uint32_t required)
+{
+ uint32_t count = 0, max_num_retries = TAIL_PTR_READ_RETRIES, num_retries = 0;
+ uint32_t tail = chan->cached_tail;
+retry:
+ if (head > tail)
+  count = (tail - 0) + (chan->num_desc_in_ring - head);
+ else if (tail > head)
+  count = tail - head;
+ else
+  /* head == tail: ring is empty, every slot but one is usable. */
+  return (chan->num_desc_in_ring - 1);
+
+ if (count > required) {
+  return count - 1;
+ } else {
+  /* Not enough room according to the cached tail; refresh it from HW. */
+  if (is_astep)
+   tail = md_mic_dma_chan_read_tail(dma_dev, chan);
+  else
+   tail = HW_CMP_CNT_MASK & md_mic_dma_read_mmio(dma_dev, chan->ch_num, REG_DSTAT);
+ }
+ chan->cached_tail = tail;
+ num_retries++;
+ if (num_retries == max_num_retries)
+  return 0;
+ cpu_relax();
+ goto retry;
+}
+
+bool md_mic_dma_chan_intr_pending(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan);
+phys_addr_t md_mic_dma_chan_get_desc_ring_phys(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan);
+phys_addr_t md_mic_dma_chan_get_dstatwb_phys(struct mic_dma_device *dma_dev,
+ struct md_mic_dma_chan *chan);
+inline uint32_t md_mic_dma_read_mmio(struct mic_dma_device *dma_dev,
+ int chan, enum md_mic_dma_chan_reg reg);
+
+/* Descriptor programming helpers */
+void md_mic_dma_prep_nop_desc(union md_mic_dma_desc *desc);
+
+/**
+ * md_mic_dma_memcpy_desc - Prepares a memory copy descriptor
+ * @src_phys: Source Physical Address must be cache line aligned
+ * @dst_phys: Destination physical address must be cache line aligned
+ * @size: Size of the transfer should not be 0, must be a multiple
+ * of cache line size, and must not exceed MIC_MAX_DMA_XFER_SIZE
+ *
+ * All preconditions are enforced only via KASSERT (DEBUG builds).
+ */
+static __always_inline void
+md_mic_dma_memcpy_desc(union md_mic_dma_desc *desc,
+   uint64_t src_phys,
+   uint64_t dst_phys,
+   uint64_t size)
+{
+ KASSERT((desc != 0), ("NULL desc"));
+ KASSERT((ALIGN(src_phys - (L1_CACHE_BYTES - 1), L1_CACHE_BYTES) == src_phys),
+  "src not cache line aligned 0x%llx\n", (unsigned long long)src_phys);
+ KASSERT((ALIGN(dst_phys - (L1_CACHE_BYTES - 1), L1_CACHE_BYTES) == dst_phys),
+  "dst not cache line aligned 0x%llx\n", (unsigned long long)dst_phys);
+ KASSERT(((size != 0) && (size <= MIC_MAX_DMA_XFER_SIZE) &&
+  (ALIGN(size - (L1_CACHE_BYTES - 1), L1_CACHE_BYTES) == size)),
+  "size > MAX_DMA_XFER_SIZE size 0x%llx", (unsigned long long)size);
+
+ desc->qwords.qw0 = 0;
+ desc->qwords.qw1 = 0;
+ desc->desc.memcopy.type = 1; /* type 1 selects the memcopy layout */
+ desc->desc.memcopy.sap = src_phys;
+ desc->desc.memcopy.dap = dst_phys;
+ /* The hardware length field counts cache lines, not bytes. */
+ desc->desc.memcopy.length = (size >> L1_CACHE_SHIFT);
+}
+
+/**
+ * md_mic_dma_prep_status_desc - Prepares a status descriptor
+ * @data - Value to be updated by the DMA engine @ dst_phys
+ * @dst_phys: Destination physical address
+ * @generate_intr: Interrupt must be generated when the DMA HW
+ * completes processing this descriptor
+ */
+static __always_inline void
+md_mic_dma_prep_status_desc(union md_mic_dma_desc *desc, uint64_t data,
+  uint64_t dst_phys, bool generate_intr)
+{
+ KASSERT((desc != 0), ("NULL desc"));
+
+ desc->qwords.qw0 = 0;
+ desc->qwords.qw1 = 0;
+ /*
+  * Consistency fix: use the status view of the union throughout.  The
+  * original wrote desc->desc.memcopy.type here, which only worked
+  * because memcopy.type and status.type alias the same top 4 bits of
+  * qw1 (compare md_mic_dma_prep_gp_desc, which uses general.type).
+  */
+ desc->desc.status.type = 2;
+ desc->desc.status.data = data;
+ desc->desc.status.dap = dst_phys;
+ if (generate_intr)
+  desc->desc.status.intr = 1;
+}
+
+/**
+ * md_mic_dma_prep_gp_desc - Prepares a general purpose descriptor
+ * @desc: descriptor to fill in (must not be NULL; checked via KASSERT)
+ * @data - Value to be updated by the DMA engine @ dst_phys
+ * @dst_phys: Destination physical address
+ *
+ * Zeroes both quadwords, then programs the general-purpose view of the
+ * descriptor (type 3) with the 32-bit payload and destination address.
+ */
+static __always_inline void
+md_mic_dma_prep_gp_desc(union md_mic_dma_desc *desc, uint32_t data, uint64_t dst_phys)
+{
+ KASSERT((desc != 0), ("NULL desc"));
+
+ desc->qwords.qw1 = 0;
+ desc->qwords.qw0 = 0;
+ desc->desc.general.dap = dst_phys;
+ desc->desc.general.data = data;
+ desc->desc.general.type = 3;
+}
+/* Debug functions */
+void md_mic_dma_print_debug(struct mic_dma_device *dma_dev, struct md_mic_dma_chan *chan);
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef __MIC_MACADDR_H__
+#define __MIC_MACADDR_H__
+
+#define MAC_RUN_SHIFT 1
+#define MAC_DATE_SHIFT 16
+
+/**
+ * mic_get_mac_from_serial - Create MAC address from serial number string
+ * \param serial string containing serial number
+ * \param mac data space to place MAC address
+ * \param host if true set least significant bit for hosts MAC
+ *
+ * mic_get_mac_from_serial() creates a MAC address from a MIC host's serial number.
+ *
+ * A MAC address contains 6 bytes of which the first 3 are either assigned by IEEE
+ * or bit 2 of the first byte is set to indicate locally created. While awaiting
+ * our assigned values, the first three bytes have been set to 'MIC' with the local
+ * bit also being set and multicast not. The result is actually seeing "NIC".
+ *
+ * The last 3 bytes, or 24 bits, are set in the pattern:
+ * o 8 bits are created by subtracting 1 from the card's year character multiplied
+ * by the work week field. By subtracting 1 the year starts at 2012 and there
+ * is enough room to account for MIC cards built through 2017
+ * o 15 bits are the work week running number from the serial number. This allows
+ * space for 32k of boards to be built in any one week.
+ * o 1 bit is used to indicate whether it is the host or card end of the virtual
+ * network connection. The bit being set is the card MAC address.
+ * Upon successful completion, mic_get_mac_from_serial returns zero. If the serial
+ * number does not have "KC" (for Knights Corner) as the 3rd and 4th characters
+ * then the serial number is invalid and a non zero value is returned.
+ */
+
+static int
+mic_get_mac_from_serial(char *serial, unsigned char *mac, int host)
+{
+ unsigned long final = 0; /* serial run number; stays 0 on parse failure */
+ int y;
+ int ww;
+
+ /* Knights Corner serial numbers carry "KC" at offsets 2 and 3. */
+ if ((serial == NULL) || (serial[2] != 'K') || (serial[3] != 'C'))
+  return 1;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,39)
+ /*
+  * kstrtoul() writes 'final' only on success.  The original code ignored
+  * its return value, so a malformed run number left 'final' holding an
+  * uninitialized value; checking the result makes the fallback (0)
+  * explicit and deterministic.
+  */
+ if (kstrtoul(&serial[7], 10, &final))
+  final = 0;
+#else
+ final = simple_strtoul(&serial[7], NULL, 10);
+#endif
+
+ final = final << MAC_RUN_SHIFT; /* Card side will add one */
+
+ y = (serial[4] - '1'); /* start year 2012 end year 2016 */
+ ww = ((serial[5] - '0') * 10) + (serial[6] - '0');
+
+ final += (y * ww) << MAC_DATE_SHIFT;
+
+ if (host) /* least bit indicates host MAC */
+  final++;
+
+ /* Locally administered OUI derived from 'MIC' (reads back as "NIC"). */
+ mac[0] = 0x4c;
+ mac[1] = 0x79;
+ mac[2] = 0xba;
+ mac[3] = (final >> 16) & 0xff;
+ mac[4] = (final >> 8) & 0xff;
+ mac[5] = final & 0xff;
+ return 0;
+}
+
+#endif /* __MIC_MACADDR_H__ */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* common power management specific header defines for host and card */
+
+#include "io_interface.h"
+
+#if !defined(__MIC_PM_H)
+#define __MIC_PM_H
+
+#define PC6_TIMER 10
+
+#define IOCTL_PM_SendIoctl _IOC(_IOC_READ|_IOC_WRITE, 'l', 2, 0)
+
+#define MAX_HW_IDLE_WAIT_COUNT 100
+#define PC3_EXIT_WAIT_COUNT 1000
+#define PM_SEND_MODE SCIF_SEND_BLOCK
+#define PM_RECV_MODE SCIF_RECV_BLOCK
+#define SET_VID_RETRY_COUNT 3
+
+#define PM_NODE_MAGIC_BIT 31
+#define PM_NODE_IDLE (1 << PM_NODE_MAGIC_BIT)
+
+#define PM_PRINT(fmt, ...) printk("[ %s : %d ]:"fmt, \
+ __func__, __LINE__, ##__VA_ARGS__)
+
+#define PM_DEBUG(fmt, ...) pr_debug("[ %s : %d ]:"fmt, \
+ __func__, __LINE__, ##__VA_ARGS__)
+
+#define PM_ENTRY PM_DEBUG("==> %s\n", __func__)
+#define PM_EXIT PM_DEBUG("<== %s\n", __func__)
+#define PM_MAJOR_VERSION 1
+#define PM_MINOR_VERSION 0
+
+
+typedef enum _PM_MESSAGE {
+ PM_MESSAGE_PC3READY,
+ PM_MESSAGE_OPEN,
+ PM_MESSAGE_OPEN_ACK,
+ PM_MESSAGE_CLOSE,
+ PM_MESSAGE_CLOSE_ACK,
+ PM_MESSAGE_TEST,
+ PM_MESSAGE_MAX,
+} PM_MESSAGE;
+
+typedef enum _PM_IDLE_STATE {
+ PM_IDLE_STATE_PC0,
+ PM_IDLE_STATE_PC3_READY,
+ PM_IDLE_STATE_PC3,
+ PM_IDLE_STATE_PC6,
+ PM_IDLE_STATE_LOST,
+ PM_IDLE_STATE_MAX,
+} PM_IDLE_STATE;
+
+#ifndef _MIC_SCIF_
+typedef enum {
+ IOCTL_pm_send,
+ IOCTL_pm_recv,
+ IOCTL_pm_send_check,
+ IOCTL_pm_get_idle_state,
+ IOCTL_pm_exit_idle_state,
+ // For emulator testing
+ IOCTL_pmemu_pc3_entry,
+ IOCTL_pmemu_pc3_exit,
+ IOCTL_pmemu_pc6_entry,
+ IOCTL_pmemu_pc6_exit,
+ IOCTL_pmemu_dpc3_entry,
+ IOCTL_pmemu_dpc3_exit,
+ IOCTL_get_dependency_graph,
+ IOCTL_get_dependency_set,
+ IOCTL_pm_toggle_connection,
+ IOCTL_pm_idlestate_exit,
+ IOCTL_pm_enable_dpc3_testing,
+ IOCTL_pm_device_restart,
+} PM_IOCTL_TYPE;
+
+struct pm_ioctl_header {
+ uint32_t node;
+ PM_IOCTL_TYPE opcode;
+ uint64_t arglen;
+};
+#define PM_TEST_MSG_BODY "PM Test Message"
+#endif
+
+//Generic PM Header. Has message type and length of message.
+typedef struct _pm_msg_header {
+ PM_MESSAGE opcode;
+ uint32_t len;
+} pm_msg_header;
+
+typedef struct _pm_msg_unit_test
+{
+ pm_msg_header header;
+ void * buf;
+} pm_msg_unit_test;
+
+typedef struct _pm_version
+{
+ uint16_t major_version;
+ uint16_t minor_version;
+
+} pm_version;
+
+typedef struct _pm_msg_pm_options
+{
+ uint8_t pc3_enabled;
+ uint8_t pc6_enabled;
+ pm_version version;
+} pm_msg_pm_options;
+
+#ifndef _MIC_SCIF_
+// PM IOCTLs
+struct pm_scif_send {
+ struct pm_ioctl_header header;
+ uint32_t length;
+ void *buf;
+};
+
+struct pm_scif_recv {
+ struct pm_ioctl_header header;
+ uint32_t length;
+ void *buf;
+};
+
+struct pm_scif_send_check {
+ struct pm_ioctl_header header;
+ uint32_t length;
+ void *buf;
+};
+
+typedef struct pm_get_idle_state {
+ struct pm_ioctl_header header;
+ PM_IDLE_STATE *idle_state;
+} pm_get_idle_state_t;
+
+typedef struct pm_exit_idle_state {
+ struct pm_ioctl_header header;
+ PM_IDLE_STATE idle_state;
+}pm_exit_idlestate_t;
+
+typedef struct dependency_graph {
+ struct pm_ioctl_header header;
+ uint32_t** depmtrx;
+} dependency_graph_t;
+
+struct io_dependency_set {
+ struct pm_ioctl_header header;
+ int is_active_set;
+ uint64_t dep_set;
+};
+
+struct io_enable_dpc3_test {
+ struct pm_ioctl_header header;
+ uint32_t enable_test;
+ uint32_t state;
+};
+
+// Snapshot of PM-related register values and state, reported for
+// status/debug (field names mirror the SBOX/GBOX PM registers above).
+typedef struct _pm_status {
+ uint32_t hoststate_reg; // SBOX_HOST_PMSTATE image
+ uint32_t cardstate_reg; // SBOX_UOS_PMSTATE image
+ uint32_t c3waketimer_reg; // SBOX_C3WAKEUP_TIMER image
+ uint32_t pcucontrol_reg; // SBOX_PCU_CONTROL image
+ uint32_t uos_pcucontrol_reg; // SBOX_UOS_PCUCONTROL image
+ uint32_t corevolt_reg; // SBOX_COREVOLT image
+ uint32_t gpmctrl_reg; // GBOX_PM_CTRL image
+ uint32_t idle_state; // current PM idle state
+ uint32_t board_id;
+} pm_status_t;
+
+// Control word for the PM test message facility.
+typedef struct _test_msg_ctrl {
+ uint32_t action;
+} test_msg_ctrl_t;
+
+// State of one PM SCIF connection (see PM_CONNECTION_STATE below).
+typedef struct _connection_info {
+ int32_t conn_state; // a PM_CONNECTION_STATE value
+ int32_t local_port;
+ int32_t local_node;
+ int32_t remote_port;
+ int32_t remote_node;
+ int32_t num_messages_queued;
+} connection_info_t;
+
+#endif //_MIC_SCIF_
+
+// PM register offsets. On KNC (CONFIG_MK1OM) these are real SBOX/GBOX
+// registers; on KNF (CONFIG_ML1OM) and Windows builds they are mapped
+// onto DBOX software-scratch ("SWFOX") registers instead.
+#if defined(CONFIG_MK1OM)
+
+#define SBOX_SVID_CONTROL 0x00004110
+#define SBOX_PCU_CONTROL 0x00004114
+#define SBOX_HOST_PMSTATE 0x00004118
+#define SBOX_UOS_PMSTATE 0x0000411c
+#define SBOX_C3WAKEUP_TIMER 0x00004120
+#define GBOX_PM_CTRL 0x0000413C
+#define SBOX_UOS_PCUCONTROL 0x0000412C
+
+#elif defined(CONFIG_ML1OM) || defined(WINDOWS)
+
+#define DBOX_SWFOX1 0x00002414
+#define DBOX_SWFOX2 0x00002418
+#define DBOX_SWFOX3 0x0000241C
+#define DBOX_SWFOX4 0x00002420
+#define DBOX_SWFOX5 0x00002424
+#define DBOX_SWFOX6 0x00002428
+#define DBOX_SWFOX7 0x0000242C
+// NOTE(review): spelled with a digit zero ("SWF0X8") unlike the letter-O
+// "SWFOX" names above; it is also not aliased below. Likely a typo, but
+// renaming would change the macro API -- confirm before touching.
+#define DBOX_SWF0X8 0x00002430
+
+#define SBOX_SVID_CONTROL DBOX_SWFOX1
+#define SBOX_PCU_CONTROL DBOX_SWFOX2
+#define SBOX_HOST_PMSTATE DBOX_SWFOX3
+#define SBOX_UOS_PMSTATE DBOX_SWFOX4
+#define SBOX_C3WAKEUP_TIMER DBOX_SWFOX5
+#define GBOX_PM_CTRL DBOX_SWFOX6
+#define SBOX_UOS_PCUCONTROL DBOX_SWFOX7
+
+#else
+#error Neither CONFIG_ML1OM nor CONFIG_MK1OM defined
+#endif
+
+// Field extract/insert helpers for the PM registers. The FOO(x) form
+// extracts a field from a register image; the FOO_BITS(x) form shifts a
+// value into field position for writing.
+#define SBOX_SVIDCTRL_SVID_DOUT(x) ((x) & 0x1ff)
+#define SBOX_SVIDCTRL_SVID_DOUT_BITS(x) ((x) & 0x1ff)
+#define SBOX_SVIDCTRL_SVID_CMD(x) (((x) >> 9) & 0x1ff)
+#define SBOX_SVIDCTRL_SVID_CMD_BITS(x) (((x) & 0x1ff) << 9)
+// NOTE(review): extracts 10 bits here, but the _sbox_svid_control union
+// declares svid_din as an 11-bit field -- confirm actual width vs HW spec.
+#define SBOX_SVIDCTRL_SVID_DIN(x) (((x) >> 18) & 0x3ff)
+#define SBOX_SVIDCTRL_SVID_ERROR(x) (((x) >> 29) & 0x1)
+#define SBOX_SVIDCTRL_SVID_IDLE(x) (((x) >> 30) & 0x1)
+#define SBOX_SVIDCTRL_CMD_START(x) (((x) >> 31) & 0x1)
+#define SBOX_SVIDCTRL_CMD_START_BITS(x) (((x) & 0x1) << 31)
+// This is not a register field, but we need to check these bits to determine parity error
+// NOTE(review): mask 0x11 keeps bits 27 and 31 of the original value; if
+// ACK1/ACK0 are the two adjacent bits at 27-28, the intended mask may have
+// been 0x3 (binary 11) -- verify against the SVID spec before changing.
+#define SBOX_SVIDCTRL_ACK1ACK0(x) (((x) >> 27) & 0x11)
+
+#define SBOX_PCUCTRL_ENABLE_MCLK_SHUTDWN(x) ((x) & 0x1)
+#define SBOX_PCUCTRL_ENABLE_MCLK_SHUTDWN_BITS(x) ((x) & 0x1)
+#define SBOX_PCUCTRL_RING_ACTIVE(x) (((x) >> 2) & 0x1)
+#define SBOX_PCUCTRL_RING_ACTIVE_BITS(x) (((x) & 0x1) << 2)
+#define SBOX_PCUCTRL_PREVENT_AUTOC3_EXIT(x) (((x) >> 3) & 0x1)
+#define SBOX_PCUCTRL_PREVENT_AUTOC3_EXIT_BITS(x) (((x) & 0x1) << 3)
+#define SBOX_PCUCTRL_PWRGOOD_MASK(x) (((x) >> 17) & 0x1)
+#define SBOX_PCUCTRL_PWRGOOD_MASK_BITS(x) (((x) & 0x1) << 17)
+#define SBOX_PCUCTRL_MCLK_PLL_LCK(x) (((x) >> 16) & 0x1)
+#define SBOX_THERMAL_STS_ALERT_LOG(x) (((x) >> 3) & 0x1)
+#define SBOX_THERMAL_STS_ALERT_LOG_BITS(x) (((x) & 0x1) << 3)
+
+// used by host to communicate card idle state to uos
+#define SBOX_HPMSTATE_STATUS(x) ((x) & 0xff)
+#define SBOX_HPMSTATE_STATUS_BITS(x) ((x) & 0xff)
+#define SBOX_HPMSTATE_MINVID(x) (((x) >> 8) & 0xff)
+#define SBOX_HPMSTATE_TDPVID(x) (((x) >> 16) & 0xff)
+// used by uos to communicate card idle state to host
+#define SBOX_UPMSTATE_STATUS(x) ((x) & 0xff)
+#define SBOX_UPMSTATE_STATUS_BITS(x) ((x) & 0xff)
+
+#define SBOX_C3WAKEUP_TIME(x) ((x) & 0xffff)
+#define SBOX_C3WAKEUP_TIME_BITS(x) ((x) & 0xffff)
+
+#define IN_PCKGC6_BITS(x) (((x) & 0x1) << 1)
+// SVID command codes / retry budget used by the voltage-setting path.
+#define KNC_SVID_ADDR 0
+#define KNC_SETVID_FAST 1
+#define KNC_SETVID_SLOW 2
+#define KNC_SETVID_ATTEMPTS 50
+
+
+// Layout of the SBOX_PCU_CONTROL register (bit 0 first).
+typedef union _sbox_pcu_ctrl {
+ uint32_t value; // raw 32-bit register image
+ struct {
+ uint32_t enable_mclk_pl_shutdown :1; // NOTE(review): "pl" vs "pll" in the uOS variant below -- inconsistent spelling
+ uint32_t mclk_enabled :1;
+ uint32_t ring_active :1; // bit 2, matches SBOX_PCUCTRL_RING_ACTIVE
+ uint32_t prevent_auto_c3_exit :1; // bit 3, matches SBOX_PCUCTRL_PREVENT_AUTOC3_EXIT
+ uint32_t ghost_active :1;
+ uint32_t tcu_active :1;
+ uint32_t itp_scllk_gate_disable :1;
+ uint32_t itp_pkg_c3_disable :1;
+ uint32_t scratch :1;
+ uint32_t unallocated_1 :1;
+ uint32_t sysint_active :1;
+ uint32_t sclk_grid_off_disable :1;
+ uint32_t icc_dvo_ssc_cg_enable :1;
+ uint32_t icc_core_ref_clk_cg_enable :1;
+ uint32_t icc_gddr_ssc_cg_enable :1;
+ uint32_t icc_pll_disable :1;
+ uint32_t mclk_pll_lock :1; // bit 16, matches SBOX_PCUCTRL_MCLK_PLL_LCK
+ uint32_t grpB_pwrgood_mask :1; // bit 17, matches SBOX_PCUCTRL_PWRGOOD_MASK
+ uint32_t unallocated_2 :14;
+ } bits;
+
+} sbox_pcu_ctrl_t;
+
+// SBOX_HOST_PMSTATE layout: host-to-card PM state word.
+// NOTE(review): SBOX_HPMSTATE_STATUS masks the low 8 bits, which spans
+// both host_pm_state (7 bits) and abort_not_processed here.
+typedef union _sbox_host_pm_state {
+ uint32_t value; // raw register image
+ struct {
+ uint32_t host_pm_state :7;
+ uint32_t abort_not_processed :1;
+ uint32_t min_vid :8; // bits 8-15, matches SBOX_HPMSTATE_MINVID
+ uint32_t tdp_vid :8; // bits 16-23, matches SBOX_HPMSTATE_TDPVID
+ uint32_t unallocated :8;
+ } bits;
+
+} sbox_host_pm_state_t;
+
+// SBOX_UOS_PMSTATE layout: card-to-host PM state word.
+typedef union _sbox_uos_pm_state {
+ uint32_t value; // raw register image
+ struct {
+ uint32_t uos_pm_state :8; // matches SBOX_UPMSTATE_STATUS
+ uint32_t unallocated :24;
+ }bits;
+
+} sbox_uos_pm_state_t;
+
+// SBOX_C3WAKEUP_TIMER layout.
+typedef union _c3_wakeup_timer {
+ uint32_t value; // raw register image
+ struct {
+ uint32_t c3_wake_time :16; // matches SBOX_C3WAKEUP_TIME
+ uint32_t unallocated_1 :1;
+ uint32_t c3_wake_timeout :1;
+ uint32_t unallocated_2 :14;
+ } bits;
+
+} c3_wakeup_timer_t;
+
+// SBOX_SVID_CONTROL layout (voltage-regulator SVID interface).
+typedef union _sbox_svid_control {
+ uint32_t value; // raw register image
+ struct {
+ uint32_t svid_dout :9; // bits 0-8
+ uint32_t svid_cmd :9; // bits 9-17
+ uint32_t svid_din :11; // bits 18-28; NOTE(review): the SBOX_SVIDCTRL_SVID_DIN macro extracts only 10 bits -- confirm
+ uint32_t svid_error :1; // bit 29, matches SBOX_SVIDCTRL_SVID_ERROR
+ uint32_t svid_idle :1; // bit 30, matches SBOX_SVIDCTRL_SVID_IDLE
+ uint32_t cmd_start :1; // bit 31, matches SBOX_SVIDCTRL_CMD_START
+ } bits;
+
+} sbox_svid_control;
+
+// GBOX_PM_CTRL layout (memory-box power management control).
+typedef union _gbox_pm_control {
+ uint32_t value; // raw register image
+ struct {
+ uint32_t c6_disable :1;
+ uint32_t in_pckgc6 :1; // bit 1, matches IN_PCKGC6_BITS
+ uint32_t gbox_inM3 :2;
+ uint32_t unallocated :28;
+ } bits;
+
+} gbox_pm_control;
+
+// SBOX thermal status/interrupt register layout; *_status fields are
+// live conditions, *_log fields are their sticky logged counterparts.
+typedef union _sbox_thermal_sts_interrupt {
+ uint32_t value; // raw register image
+ struct {
+ uint32_t mclk_ratio_status :1;
+ uint32_t mclk_ratio_log :1;
+ uint32_t alert_status :1;
+ uint32_t alert_log :1; // bit 3, matches SBOX_THERMAL_STS_ALERT_LOG
+ uint32_t gpu_hot_status :1;
+ uint32_t gpu_hot_log :1;
+ uint32_t pwr_alert_status :1;
+ uint32_t pwr_alert_log :1;
+ uint32_t pmu_status :1;
+ uint32_t pmu_log :1;
+ uint32_t etc_freeze :1;
+ uint32_t unallocated :21;
+ }bits;
+
+} sbox_thermal_sts_interrupt;
+
+// SBOX_UOS_PCUCONTROL layout (uOS-side PCU control).
+typedef union _sboxUosPcucontrolReg
+{
+ uint32_t value; // raw register image
+ struct
+ {
+ uint32_t c3_wakeuptimer_enable :1;
+ uint32_t enable_mclk_pll_shutdown :1;
+ uint32_t spi_clk_disable :1;
+ uint32_t unallocated :29;
+ } bits;
+
+} sbox_uos_pcu_ctrl_t;
+
+// SBOX_COREFREQ layout: core clock (MCLK) ratio control.
+typedef union _sboxCorefreqReg
+{
+ uint32_t value; // raw register image
+ struct
+ {
+ uint32_t ratio :12; // bit 0-11 Ratio
+ uint32_t rsvd0 : 3; // bit 12-14
+ uint32_t fuseratio : 1; // bit 15 If overclocking is enabled, setting this bit will default the goal ratio to the fuse value.
+ uint32_t asyncmode : 1; // bit 16 Async Mode Bit 16, Reserved Bits 20:17 used to be ExtClkFreq,
+ uint32_t rsvd1 : 9; // bit 17-25
+ uint32_t ratiostep : 4; // bit 26-29 Power throttle ratio-step
+ uint32_t jumpratio : 1; // bit 30 Power throttle jump at once
+ uint32_t booted : 1; // bit 31 Booted: This bit selects between the default MCLK Ratio (600MHz) and the programmable MCLK ratio. 0=default 1=programmable.
+ } bits;
+
+} sbox_core_freq_t;
+
+// SBOX_COREVOLT layout: core voltage identifier.
+typedef union _sboxCoreVoltReg
+{
+ uint32_t value; // raw register image
+ struct
+ {
+ uint32_t vid :8; // voltage ID
+ uint32_t unallocated :24;
+ } bits;
+
+} sbox_core_volt_t;
+
+// Lifecycle of a PM connection (stored in connection_info_t.conn_state).
+typedef enum _PM_CONNECTION_STATE {
+ PM_CONNECTING,
+ PM_CONNECTED,
+ PM_DISCONNECTING,
+ PM_DISCONNECTED
+} PM_CONNECTION_STATE;
+
+#endif //__MIC_PM_H
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MIC_SBOX_MD_H
+#define MIC_SBOX_MD_H
+/*
+ * TODO: SBOX MCA Handling
+ */
+#ifdef _MIC_SCIF_
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+#endif // _MIC_SCIF_
+
+#ifdef _MIC_SCIF_
+void *mic_sbox_md_init(void);
+void mic_sbox_md_uninit(void *mic_sbox_mmio_va);
+#endif
+
+/*
+ * Read the 32-bit SBOX register located 'offset' bytes past the mapped
+ * MMIO base address 'mic_sbox_mmio_va'.
+ */
+static inline uint32_t mic_sbox_read_mmio(void *mic_sbox_mmio_va, uint32_t offset)
+{
+ uint8_t *reg_addr = (uint8_t *)mic_sbox_mmio_va + offset;
+
+ return readl(reg_addr);
+}
+
+/*
+ * Write 'value' to the 32-bit SBOX register located 'offset' bytes past
+ * the mapped MMIO base address 'mic_sbox_mmio_va'.
+ */
+static inline void mic_sbox_write_mmio(void *mic_sbox_mmio_va, uint32_t offset, uint32_t value)
+{
+ uint8_t *reg_addr = (uint8_t *)mic_sbox_mmio_va + offset;
+
+ writel(value, reg_addr);
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ Structures which are passed from host to MIC card through
+ uOS kernel command line option, virtio_addr.
+
+ (C) Copyright 2012 Intel Corporation
+ Author: Caz Yokoyama <Caz.Yokoyama@intel.com>
+ */
+#ifndef MIC_VIRTIO_H
+#define MIC_VIRTIO_H
+
+// Virtio-block state shared between host and card (passed via the uOS
+// kernel command line per the file header); aligned(8) so both sides
+// agree on the layout.
+struct vb_shared {
+ uint32_t host_features; // virtio feature bits offered by the host
+ uint32_t client_features; // virtio feature bits accepted by the card
+ bool update;
+ struct vring vring;
+ struct virtio_blk_config blk_config;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0))
+ uint32_t unused; // presumably pads the struct to match newer kernels' layout -- confirm
+#endif
+} __attribute__((aligned(8)));
+
+// Per-device virtblk bookkeeping; the layout differs between the host
+// (vhost side, owns the shared struct) and the card (virtio side, maps it).
+struct mic_virtblk {
+#ifdef HOST
+ struct vb_shared vb_shared;
+ void *vblk; /* keep vblk in vhost for virtblk */
+#else
+ struct vb_shared *vb_shared;
+ void *vdev; /* keep vdev in virtio for virtblk */
+#endif
+};
+
+// Disconnect the nodes given by node_bitmask; returns a bitmask (presumably of nodes actually disconnected -- confirm at definition).
+uint64_t mic_vhost_pm_disconnect_node(uint64_t node_bitmask, enum disconn_type type);
+// Stop virtio-block service for the given board.
+void mic_vhost_blk_stop(bd_info_t *bd_info);
+
+#endif // MIC_VIRTIO_H
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* "Raw" register offsets & bit specifications for MIC */
+#ifndef _MIC_MICBASEDEFINE_REGISTERS_H_
+#define _MIC_MICBASEDEFINE_REGISTERS_H_
+
+#define COMMON_MMIO_BOX_SIZE (1<<16)
+
+/* CBOX register base defines */
+#define CBOX_BASE 0x0000000000ULL
+
+/* TXS register base defines */
+#define TXS0_BASE 0x0800780000ULL
+#define TXS1_BASE 0x0800770000ULL
+#define TXS2_BASE 0x0800760000ULL
+#define TXS3_BASE 0x0800750000ULL
+#define TXS4_BASE 0x0800740000ULL
+#define TXS5_BASE 0x0800730000ULL
+#define TXS6_BASE 0x0800720000ULL
+#define TXS7_BASE 0x0800710000ULL
+#define TXS8_BASE 0x08006E0000ULL
+
+/* GBOX register base defines */
+#define GBOX0_BASE 0x08007A0000ULL
+#define GBOX1_BASE 0x0800790000ULL
+#define GBOX2_BASE 0x0800700000ULL
+#define GBOX3_BASE 0x08006F0000ULL
+
+#define GBOX_CHANNEL0_BASE 0x00000000
+#define GBOX_CHANNEL1_BASE 0x00000800
+#define GBOX_CHANNEL2_BASE 0x00001000
+
+/* VBOX register base defines */
+#define VBOX_BASE 0x08007B0000ULL
+
+/* DBOX register base defines */
+#define DBOX_BASE 0x08007C0000ULL
+
+/* SBOX register base defines */
+#define SBOX_BASE 0x08007D0000ULL
+
+// The *_SIZE macros below compute inclusive-range sizes (TOP - BASE + 1).
+#define MIC_GTT_BASE 0x0800800000ULL
+#define MIC_GTT_TOP 0x080083FFFFULL
+#define MIC_GTT_SIZE (MIC_GTT_TOP - MIC_GTT_BASE + 1)
+
+/* Aperture defines */
+#define MIC_APERTURE_BASE 0x0900000000ULL
+#define MIC_APERTURE_TOP 0x090FFFFFFFULL
+#define MIC_APERTURE_SIZE (MIC_APERTURE_TOP - MIC_APERTURE_BASE + 1)
+
+/* SPI flash defines */
+// Note: the SPI regions are listed from the top of flash downwards
+// (bootloader highest, then 2nd stage, then parameters).
+#define MIC_SPI_BOOTLOADER_BASE 0x0FFFFF0000ULL
+#define MIC_SPI_BOOTLOADER_TOP 0x0FFFFFFFFFULL
+#define MIC_SPI_BOOTLOADER_SIZE (MIC_SPI_BOOTLOADER_TOP - MIC_SPI_BOOTLOADER_BASE + 1)
+#define MIC_SPI_2ND_STAGE_BASE 0x0FFFFE0000ULL
+#define MIC_SPI_2ND_STAGE_TOP 0x0FFFFEFFFFULL
+#define MIC_SPI_2ND_STAGE_SIZE (MIC_SPI_2ND_STAGE_TOP - MIC_SPI_2ND_STAGE_BASE + 1)
+#define MIC_SPI_PARAMETER_BASE 0x0FFFFDC000ULL
+#define MIC_SPI_PARAMETER_TOP 0x0FFFFDFFFFULL
+#define MIC_SPI_PARAMETER_SIZE (MIC_SPI_PARAMETER_TOP - MIC_SPI_PARAMETER_BASE + 1)
+
+/* remote defines */
+#define MIC_REMOTE_BASE 0x1000000000ULL
+#define MIC_REMOTE_TOP 0x7FFFFFFFFFULL
+#define MIC_REMOTE_SIZE (MIC_REMOTE_TOP - MIC_REMOTE_BASE + 1)
+
+/* system defines */
+#define MIC_SYSTEM_BASE 0x8000000000ULL
+#define MIC_SYSTEM_TOP 0xFFFFFFFFFFULL
+#define MIC_SYSTEM_PAGE_SIZE 0x0400000000ULL
+#define MIC_SYSTEM_SIZE (MIC_SYSTEM_TOP - MIC_SYSTEM_BASE + 1)
+
+// The card's physical address space is 40 bits wide.
+#define MIC_PHYSICAL_ADDRESS_BITS 40
+#define MIC_PHYSICAL_ADDRESS_SPACE_SIZE ( 1ULL << MIC_PHYSICAL_ADDRESS_BITS )
+
+#define MIC_HOST_MMIO_BASE DBOX_BASE
+
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* "Raw" register offsets & bit specifications for Intel MIC (KNF) */
+#ifndef _MIC_DBOXDEFINE_REGISTERS_H_
+#define _MIC_DBOXDEFINE_REGISTERS_H_
+
+// DBOX software scratch (SWF) register offsets.
+#define DBOX_SWF0X0 0x00002410
+
+
+#define DBOX_SWF1X0 0x00003410
+#define DBOX_SWF1X1 0x00003414
+#define DBOX_SWF1X2 0x00003418
+#define DBOX_SWF1X3 0x0000341C
+
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef _MIC_PSMI_H
+#define _MIC_PSMI_H
+
+// One PSMI page-table entry: the physical address of a PSMI page.
+struct mic_psmi_pte {
+ uint64_t pa;
+};
+
+// Per-board PSMI context: DMA-able page table plus its mapping info.
+struct mic_psmi_ctx
+{
+ unsigned char enabled; // non-zero when PSMI is enabled for this board
+
+ struct mic_psmi_pte *dma_tbl; // table of PSMI page physical addresses
+ int dma_tbl_size; // number of entries in dma_tbl
+ dma_addr_t dma_tbl_hndl; // DMA handle for dma_tbl itself
+ uint64_t dma_mem_size; // total bytes of PSMI memory covered
+ int nr_dma_pages; // number of PSMI pages allocated
+
+ struct mic_psmi_pte *va_tbl; // host-virtual-address view of the table -- confirm at definition
+};
+
+// PSMI pages are 2^7 = 128 base pages each (512 KiB with 4 KiB pages).
+#define MIC_PSMI_PAGE_ORDER (7)
+#define MIC_PSMI_PAGE_SIZE (PAGE_SIZE << MIC_PSMI_PAGE_ORDER)
+// Little-endian ASCII for "PSMIROCK".
+#define MIC_PSMI_SIGNATURE 0x4B434F52494D5350L
+
+// Character-device open hook for the PSMI interface.
+int mic_psmi_open(struct file *filp);
+
+#endif /* _MIC_PSMI_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* "Raw" register offsets & bit specifications for Intel MIC (KNF) */
+#ifndef _MIC_SBOXDEFINE_REGISTERS_H_
+#define _MIC_SBOXDEFINE_REGISTERS_H_
+
+
+#define SBOX_OC_I2C_ICR 0x00001000
+#define SBOX_THERMAL_STATUS 0x00001018
+#define SBOX_THERMAL_INTERRUPT_ENABLE 0x0000101C
+#define SBOX_STATUS_FAN1 0x00001024
+#define SBOX_STATUS_FAN2 0x00001028
+#define SBOX_SPEED_OVERRIDE_FAN 0x0000102C
+#define SBOX_BOARD_TEMP1 0x00001030
+#define SBOX_BOARD_TEMP2 0x00001034
+#define SBOX_BOARD_VOLTAGE_SENSE 0x00001038
+#define SBOX_CURRENT_DIE_TEMP0 0x0000103C
+#define SBOX_CURRENT_DIE_TEMP1 0x00001040
+#define SBOX_CURRENT_DIE_TEMP2 0x00001044
+#define SBOX_MAX_DIE_TEMP0 0x00001048
+#define SBOX_MAX_DIE_TEMP1 0x0000104C
+#define SBOX_MAX_DIE_TEMP2 0x00001050
+#define SBOX_ELAPSED_TIME_LOW 0x00001074
+#define SBOX_ELAPSED_TIME_HIGH 0x00001078
+#define SBOX_FAIL_SAFE_OFFSET 0x00002004
+#define SBOX_CURRENT_CLK_RATIO 0x00003004
+#define SBOX_SMPT00 0x00003100
+#define SBOX_SMPT02 0x00003108
+#define SBOX_RGCR 0x00004010
+#define SBOX_DSTAT 0x00004014
+#define SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8 0x00005808
+#define SBOX_PCIE_BAR_ENABLE 0x00005CD4
+#define SBOX_SICR0 0x00009004
+#define SBOX_SICE0 0x0000900C
+#define SBOX_SICC0 0x00009010
+#define SBOX_SICR1 0x0000901C
+#define SBOX_SICC1 0x00009028
+// Registers whose offsets differ between KNC (CONFIG_MK1OM) and KNF.
+#ifdef CONFIG_MK1OM
+#define SBOX_PMU_PERIOD_SEL 0x00001070
+#define SBOX_THERMAL_STATUS_INTERRUPT 0x0000107C
+#define SBOX_THERMAL_STATUS_2 0x00001080
+#define SBOX_THERMAL_TEST_2 0x00001084
+#define SBOX_COREFREQ 0x00004100
+#define SBOX_COREVOLT 0x00004104
+#define SBOX_MEMORYFREQ 0x00004108
+#define SBOX_MEMVOLT 0x0000410C
+//add defines used by drivers that are the same as DOORBELL_INTX
+#define SBOX_SDBIC0 0x0000CC90
+#define SBOX_SDBIC1 0x0000CC94
+#define SBOX_SDBIC2 0x0000CC98
+#define SBOX_SDBIC3 0x0000CC9C
+#else
+#define SBOX_SDBIC0 0x00009030
+#define SBOX_SDBIC1 0x00009034
+#define SBOX_SDBIC2 0x00009038
+#define SBOX_SDBIC3 0x0000903C
+#define SBOX_COREFREQ 0x00004040
+#define SBOX_COREVOLT 0x00004044
+#define SBOX_MEMORYFREQ 0x00004048
+#define SBOX_MEMVOLT 0x0000404C
+#define SBOX_RSC0 0x0000CC10
+#define SBOX_RSC1 0x0000CC14
+
+#endif
+// MSI-X address registers. The _K1OM variants are offset by 4 on KNC
+// (note SBOX_MXAR0_K1OM intentionally equals SBOX_MXAR1's offset).
+#define SBOX_MXAR0 0x00009040
+#define SBOX_MXAR0_K1OM 0x00009044
+#define SBOX_MXAR1 0x00009044
+#define SBOX_MXAR2 0x00009048
+#define SBOX_MXAR3 0x0000904C
+#define SBOX_MXAR4 0x00009050
+#define SBOX_MXAR5 0x00009054
+#define SBOX_MXAR6 0x00009058
+#define SBOX_MXAR7 0x0000905C
+#define SBOX_MXAR8 0x00009060
+#define SBOX_MXAR9 0x00009064
+#define SBOX_MXAR10 0x00009068
+#define SBOX_MXAR11 0x0000906C
+#define SBOX_MXAR12 0x00009070
+#define SBOX_MXAR13 0x00009074
+#define SBOX_MXAR14 0x00009078
+#define SBOX_MXAR15 0x0000907C
+#define SBOX_MSIXPBACR 0x00009080
+#define SBOX_MSIXPBACR_K1OM 0x00009084
+#define SBOX_DCAR_0 0x0000A000
+#define SBOX_DHPR_0 0x0000A004
+#define SBOX_DTPR_0 0x0000A008
+#define SBOX_DAUX_LO_0 0x0000A00C
+#define SBOX_DAUX_HI_0 0x0000A010
+#define SBOX_DRAR_LO_0 0x0000A014
+#define SBOX_DRAR_HI_0 0x0000A018
+#define SBOX_DITR_0 0x0000A01C
+#define SBOX_DSTAT_0 0x0000A020
+#define SBOX_DSTATWB_LO_0 0x0000A024
+#define SBOX_DSTATWB_HI_0 0x0000A028
+#define SBOX_DCHERR_0 0x0000A02C
+#define SBOX_DCHERRMSK_0 0x0000A030
+#define SBOX_DCAR_1 0x0000A040
+#define SBOX_DHPR_1 0x0000A044
+#define SBOX_DTPR_1 0x0000A048
+#define SBOX_DAUX_LO_1 0x0000A04C
+#define SBOX_DAUX_HI_1 0x0000A050
+#define SBOX_DRAR_LO_1 0x0000A054
+#define SBOX_DRAR_HI_1 0x0000A058
+#define SBOX_DITR_1 0x0000A05C
+#define SBOX_DSTAT_1 0x0000A060
+#define SBOX_DSTATWB_LO_1 0x0000A064
+#define SBOX_DSTATWB_HI_1 0x0000A068
+#define SBOX_DCHERR_1 0x0000A06C
+#define SBOX_DCHERRMSK_1 0x0000A070
+#define SBOX_DCAR_2 0x0000A080
+#define SBOX_DHPR_2 0x0000A084
+#define SBOX_DTPR_2 0x0000A088
+#define SBOX_DAUX_LO_2 0x0000A08C
+#define SBOX_DAUX_HI_2 0x0000A090
+#define SBOX_DRAR_LO_2 0x0000A094
+#define SBOX_DRAR_HI_2 0x0000A098
+#define SBOX_DITR_2 0x0000A09C
+#define SBOX_DSTAT_2 0x0000A0A0
+#define SBOX_DSTATWB_LO_2 0x0000A0A4
+#define SBOX_DSTATWB_HI_2 0x0000A0A8
+#define SBOX_DCHERR_2 0x0000A0AC
+#define SBOX_DCHERRMSK_2 0x0000A0B0
+#define SBOX_DCAR_3 0x0000A0C0
+#define SBOX_DHPR_3 0x0000A0C4
+#define SBOX_DTPR_3 0x0000A0C8
+#define SBOX_DAUX_LO_3 0x0000A0CC
+#define SBOX_DAUX_HI_3 0x0000A0D0
+#define SBOX_DRAR_LO_3 0x0000A0D4
+#define SBOX_DRAR_HI_3 0x0000A0D8
+#define SBOX_DITR_3 0x0000A0DC
+#define SBOX_DSTAT_3 0x0000A0E0
+#define SBOX_DSTATWB_LO_3 0x0000A0E4
+#define SBOX_DSTATWB_HI_3 0x0000A0E8
+#define SBOX_DCHERR_3 0x0000A0EC
+#define SBOX_DCHERRMSK_3 0x0000A0F0
+#define SBOX_DCAR_4 0x0000A100
+#define SBOX_DHPR_4 0x0000A104
+#define SBOX_DTPR_4 0x0000A108
+#define SBOX_DAUX_LO_4 0x0000A10C
+#define SBOX_DAUX_HI_4 0x0000A110
+#define SBOX_DRAR_LO_4 0x0000A114
+#define SBOX_DRAR_HI_4 0x0000A118
+#define SBOX_DITR_4 0x0000A11C
+#define SBOX_DSTAT_4 0x0000A120
+#define SBOX_DSTATWB_LO_4 0x0000A124
+#define SBOX_DSTATWB_HI_4 0x0000A128
+#define SBOX_DCHERR_4 0x0000A12C
+#define SBOX_DCHERRMSK_4 0x0000A130
+#define SBOX_DCAR_5 0x0000A140
+#define SBOX_DHPR_5 0x0000A144
+#define SBOX_DTPR_5 0x0000A148
+#define SBOX_DAUX_LO_5 0x0000A14C
+#define SBOX_DAUX_HI_5 0x0000A150
+#define SBOX_DRAR_LO_5 0x0000A154
+#define SBOX_DRAR_HI_5 0x0000A158
+#define SBOX_DITR_5 0x0000A15C
+#define SBOX_DSTAT_5 0x0000A160
+#define SBOX_DSTATWB_LO_5 0x0000A164
+#define SBOX_DSTATWB_HI_5 0x0000A168
+#define SBOX_DCHERR_5 0x0000A16C
+#define SBOX_DCHERRMSK_5 0x0000A170
+#define SBOX_DCAR_6 0x0000A180
+#define SBOX_DHPR_6 0x0000A184
+#define SBOX_DTPR_6 0x0000A188
+#define SBOX_DAUX_LO_6 0x0000A18C
+#define SBOX_DAUX_HI_6 0x0000A190
+#define SBOX_DRAR_LO_6 0x0000A194
+#define SBOX_DRAR_HI_6 0x0000A198
+#define SBOX_DITR_6 0x0000A19C
+#define SBOX_DSTAT_6 0x0000A1A0
+#define SBOX_DSTATWB_LO_6 0x0000A1A4
+#define SBOX_DSTATWB_HI_6 0x0000A1A8
+#define SBOX_DCHERR_6 0x0000A1AC
+#define SBOX_DCHERRMSK_6 0x0000A1B0
+#define SBOX_DCAR_7 0x0000A1C0
+#define SBOX_DHPR_7 0x0000A1C4
+#define SBOX_DTPR_7 0x0000A1C8
+#define SBOX_DAUX_LO_7 0x0000A1CC
+#define SBOX_DAUX_HI_7 0x0000A1D0
+#define SBOX_DRAR_LO_7 0x0000A1D4
+#define SBOX_DRAR_HI_7 0x0000A1D8
+#define SBOX_DITR_7 0x0000A1DC
+#define SBOX_DSTAT_7 0x0000A1E0
+#define SBOX_DSTATWB_LO_7 0x0000A1E4
+#define SBOX_DSTATWB_HI_7 0x0000A1E8
+#define SBOX_DCHERR_7 0x0000A1EC
+#define SBOX_DCHERRMSK_7 0x0000A1F0
+#define SBOX_DCR 0x0000A280
+#define SBOX_APICICR0 0x0000A9D0
+#define SBOX_APICICR1 0x0000A9D8
+#define SBOX_APICICR2 0x0000A9E0
+#define SBOX_APICICR3 0x0000A9E8
+#define SBOX_APICICR4 0x0000A9F0
+#define SBOX_APICICR5 0x0000A9F8
+#define SBOX_APICICR6 0x0000AA00
+#define SBOX_APICICR7 0x0000AA08
+#define SBOX_SCRATCH0 0x0000AB20
+#define SBOX_SCRATCH1 0x0000AB24
+#define SBOX_SCRATCH2 0x0000AB28
+#define SBOX_SCRATCH3 0x0000AB2C
+#define SBOX_SCRATCH4 0x0000AB30
+#define SBOX_SCRATCH5 0x0000AB34
+#define SBOX_SCRATCH6 0x0000AB38
+#define SBOX_SCRATCH7 0x0000AB3C
+#define SBOX_SCRATCH8 0x0000AB40
+#define SBOX_SCRATCH9 0x0000AB44
+#define SBOX_SCRATCH10 0x0000AB48
+#define SBOX_SCRATCH11 0x0000AB4C
+#define SBOX_SCRATCH12 0x0000AB50
+#define SBOX_SCRATCH13 0x0000AB54
+#define SBOX_SCRATCH14 0x0000AB58
+#define SBOX_SCRATCH15 0x0000AB5C
+#define SBOX_RDMASR0 0x0000B180
+#define SBOX_SBQ_FLUSH 0x0000B1A0 // Pseudo-register, not autogen, must add manually
+#define SBOX_TLB_FLUSH 0x0000B1A4
+#define SBOX_GTT_PHY_BASE 0x0000C118
+#define SBOX_EMON_CNT0 0x0000CC28
+#define SBOX_EMON_CNT1 0x0000CC2C
+#define SBOX_EMON_CNT2 0x0000CC30
+#define SBOX_EMON_CNT3 0x0000CC34
+
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICSCIF_H
+#define MICSCIF_H
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
+#ifdef _MODULE_SCIF_
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <asm/uaccess.h>
+#include <linux/poll.h>
+#include <linux/mmzone.h>
+#include <linux/version.h>
+#endif /* _MODULE_SCIF_ */
+
+#include <linux/notifier.h>
+#include "scif.h"
+#include "mic/micbaseaddressdefine.h"
+#include "mic/micsboxdefine.h"
+
+/* The test runs in a separate thread context from the bottom
+ * half that processes messages from the card and sets up p2p.
+ * When these run concurrently, p2p messages can get lost because they
+ * may be consumed by the test thread.
+ */
+//#define ENABLE_TEST // Used to enable testing at board connect
+#ifdef MIC_IS_EMULATION
+#define TEST_LOOP 2
+#else
+#define TEST_LOOP 2000
+#endif
+
+//#define P2P_HACK 0
+#include "scif.h"
+#include "scif_ioctl.h"
+
+#define SCIF_READY_MAGIC_NUM 0x1eedfee0
+
+#ifndef SCIF_MAJOR
+#define SCIF_MAJOR 0 /* Use dynamic major number by default */
+#endif
+
+#define SCIF_HOST_NODE 0 // By default the host is always node zero
+
+#define SCIF_RMA_TEMP_CACHE_LIMIT 0x20000
+/*
+ * The overhead for proxying a P2P DMA read to convert it to
+ * a DMA write by sending a SCIF Node QP message has been
+ * seen to be higher than programming a P2P DMA Read on self
+ * for transfer sizes less than the PROXY_DMA_THRESHOLD.
+ * The minimum threshold is different for Jaketown versus
+ * Ivytown and tuned for best DMA performance.
+ */
+#define SCIF_PROXY_DMA_THRESHOLD_JKT (32 * 1024ULL)
+#define SCIF_PROXY_DMA_THRESHOLD_IVT (1024 * 1024ULL)
+
+//#define RMA_DEBUG 0
+
+/* Pre-defined L1_CACHE_SHIFT is 6 on RH and 7 on Suse */
+#undef L1_CACHE_SHIFT
+#define L1_CACHE_SHIFT 6
+#undef L1_CACHE_BYTES
+#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+
+#define MI_EPLOCK_HELD (true)
+#define MAX_RDMASR 8
+
+// Device wide SCIF information. A single instance (ms_info) exists per
+// driver; it owns the global endpoint lists and the locks protecting them.
+struct micscif_info {
+ uint32_t mi_nodeid; // Node ID this node is to others.
+
+ struct mutex mi_conflock; // Configuration lock (used in p2p setup)
+ uint32_t mi_maxid; // Max known board ID
+ uint32_t mi_total; // Total number of running interfaces
+ uint32_t mi_nr_zombies; // Keep track of the number of zombie EP.
+ unsigned long mi_mask; // bit mask of online scif interfaces
+ uint64_t mi_nr_ioremap; // Keep track of number of ioremap() calls on the host
+ // to decide when to purge aliases for performance.
+ spinlock_t mi_eplock; // Protects mi_listen and mi_zombie lists (see helpers below)
+ spinlock_t mi_connlock; // Protects mi_connected and mi_disconnected lists
+ spinlock_t mi_rmalock; // Synchronize access to list of temporary registered
+ // windows to be destroyed.
+ struct mutex mi_fencelock; // Synchronize access to list of remote fences requested.
+ struct mutex mi_event_cblock;
+ spinlock_t mi_nb_connect_lock;
+
+ struct list_head mi_uaccept; // List of user acceptreq waiting for acceptreg
+ struct list_head mi_listen; // List of listening end points
+ struct list_head mi_zombie; // List of zombie end points with pending RMA's.
+ struct list_head mi_connected; // List of end points in connected state
+ struct list_head mi_disconnected; // List of end points in disconnected state
+ struct list_head mi_rma; // List of temporary registered windows to be destroyed.
+ struct list_head mi_rma_tc; // List of temporary
+ // registered & cached windows
+ // to be destroyed.
+ struct list_head mi_fence; // List of remote fence requests.
+ struct list_head mi_event_cb; /* List of event handlers registered */
+ struct list_head mi_nb_connect_list;
+#ifdef CONFIG_MMU_NOTIFIER
+ struct list_head mi_mmu_notif_cleanup;
+#endif
+ struct notifier_block mi_panic_nb;
+#ifndef _MIC_SCIF_
+ /* The host needs to keep track of node dependencies in form of graph.
+ * This will need to be dynamically grown to support hotplug.
+ */
+ uint32_t **mi_depmtrx;
+ /*
+ * Wait queue used for blocking while waiting for nodes
+ * to respond for disconnect message sent from host.
+ */
+ wait_queue_head_t mi_disconn_wq;
+ /* status of node remove operation */
+ uint64_t mi_disconnect_status;
+ atomic_long_t mi_unique_msgid;
+#endif
+ /*
+ * Watchdog timeout on the host. Timer expiry will result in the host
+ * treating the remote node as a lost node. Default value is
+ * DEFAULT_WATCHDOG_TO and can be modified to a value greater than 1
+ * second via SCIF sysfs watchdog_to entry.
+ */
+ int mi_watchdog_to; // Watchdog timeout
+ int mi_watchdog_enabled; // Watchdog timeout enabled
+ int mi_watchdog_auto_reboot; // Watchdog auto reboot enabled
+ struct workqueue_struct *mi_misc_wq; // Workqueue for miscellaneous SCIF tasks.
+ struct work_struct mi_misc_work;
+#ifdef CONFIG_MMU_NOTIFIER
+ struct workqueue_struct *mi_mmu_notif_wq; // Workqueue for MMU notifier cleanup tasks.
+ struct work_struct mi_mmu_notif_work;
+#endif
+ int nr_gtt_entries; // GTT Debug Counter to detect leaks
+ uint64_t nr_2mb_pages; // Debug Counter for number of 2mb pages.
+ uint64_t nr_4k_pages; // Debug Counter for number of 4K pages
+ uint8_t en_msg_log;
+ wait_queue_head_t mi_exitwq;
+ unsigned long mi_rma_tc_limit;
+ uint64_t mi_proxy_dma_threshold;
+#ifdef RMA_DEBUG
+ atomic_long_t rma_mm_cnt;
+ atomic_long_t rma_unaligned_cpu_cnt;
+ atomic_long_t rma_alloc_cnt;
+ atomic_long_t rma_pin_cnt;
+#ifdef CONFIG_MMU_NOTIFIER
+ atomic_long_t mmu_notif_cnt;
+#endif
+#endif
+#ifdef _MIC_SCIF_
+ int mi_intr_rcnt[MAX_RDMASR]; // Ref count to track SCIF Interrupt Handlers
+#endif
+ struct workqueue_struct *mi_conn_wq;
+ struct work_struct mi_conn_work;
+};
+
+extern struct micscif_info ms_info;
+
+#define SCIF_NODE_MAGIC_BIT 63
+/* Magic value used to indicate a remote idle node without grabbing any locks */
+#define SCIF_NODE_IDLE (1ULL << SCIF_NODE_MAGIC_BIT)
+
+// Lifecycle of a remote SCIF device as seen from this node.
+enum scif_state {
+ SCIFDEV_NOTPRESENT, // peer not present / not known
+ SCIFDEV_INIT, // peer detected, bring-up in progress
+ SCIFDEV_RUNNING, // peer operational
+ SCIFDEV_SLEEPING, // peer in low-power state; still counts as alive (see scifdev_alive())
+ SCIFDEV_STOPPING, // shutdown in progress
+ SCIFDEV_STOPPED // peer stopped or lost (see scif_invalidate_ep())
+};
+
+extern bool mic_p2p_enable;
+extern bool mic_p2p_proxy_enable;
+extern bool mic_reg_cache_enable;
+extern bool mic_ulimit_check;
+/* p2p mapping from node id to peer id */
+/* Per-peer DMA mapping info for MIC peer-to-peer. The [2] arrays are
+ * indexed by PPI_MMIO / PPI_APER (defined below): one entry for the MMIO
+ * space and one for the aperture. */
+struct scif_p2p_info {
+ int ppi_peer_id; // node id of the peer this mapping describes
+ struct scatterlist *ppi_sg[2];
+ uint64_t sg_nentries[2]; // no of entries in scatterlists
+ dma_addr_t ppi_pa[2]; // one for mmio; one for aper
+ dma_addr_t ppi_mic_addr[2]; // one for mmio; one for aper
+ uint64_t ppi_len[2];
+#define PPI_MMIO 0
+#define PPI_APER 1
+ enum scif_state ppi_disc_state; //Disconnection state of this peer node.
+ struct list_head ppi_list;
+};
+
+/* one per remote node */
+/* State this node keeps for each remote SCIF device (including itself for
+ * loopback); the global scif_dev[] array is indexed by node id. */
+struct micscif_dev {
+ uint16_t sd_node;
+ enum scif_state sd_state; // see enum scif_state; guarded by sd_lock
+ volatile void *mm_sbox; // mapped SBOX MMIO of this node — TODO confirm mapping origin
+ uint64_t sd_base_addr; /* Remote node's base bus addr
+ * for the local node's aperture
+ */
+#ifndef _MIC_SCIF_
+ struct list_head sd_p2p; /* List of bus addresses for
+ * other nodes, these are allocated
+ * by the host driver and are
+ * valid only on the host node
+ */
+ struct delayed_work sd_watchdog_work;
+ wait_queue_head_t sd_watchdog_wq;
+ struct workqueue_struct *sd_ln_wq;
+ char sd_ln_wqname[16];
+#endif
+
+ int n_qpairs; /* FIXME:
+ * This is always set to 1,
+ */
+
+ struct micscif_qp *qpairs; /* Same FIXME as above
+ * There is single qp established
+ * with this remote node
+ */
+
+ struct workqueue_struct *sd_intr_wq; /* sd_intr_wq & sd_intr_bh
+ * together constitute the workqueue
+ * infrastructure needed to
+ * run the bottom half handler
+ * for messages received from
+ * this node
+ */
+ char sd_intr_wqname[16];
+ struct work_struct sd_intr_bh;
+ unsigned int sd_intr_handle;
+ uint32_t sd_rdmasr;
+ struct workqueue_struct *sd_loopb_wq;
+ char sd_loopb_wqname[16];
+ struct work_struct sd_loopb_work;
+ struct list_head sd_loopb_recv_q;
+ /* Lock to synchronize remote node state transitions */
+ struct mutex sd_lock;
+ /*
+ * Global Ref count per SCIF device tracking all SCIF API's which
+ * might communicate across PCIe.
+ */
+ atomic_long_t scif_ref_cnt;
+ /*
+ * Global Ref count per SCIF device tracking scif_mmap()/
+ * scif_get_pages(). sd_lock protects scif_map_ref_cnt
+ * hence it does not need to be an atomic operation. Note that
+ * scif_mmap()/scif_get_pages() is not in the critical
+ * perf path.
+ */
+ int scif_map_ref_cnt;
+ /*
+ * Wait queue used for blocking while waiting for nodes
+ * to wake up or to be removed.
+ */
+ wait_queue_head_t sd_wq;
+ uint64_t sd_wait_status;
+#ifdef _MIC_SCIF_
+ wait_queue_head_t sd_p2p_wq;
+ bool sd_proxy_dma_reads;
+ struct delayed_work sd_p2p_dwork;
+ int sd_p2p_retry;
+#endif
+ /*
+ * The NUMA node the peer is attached to on the host.
+ */
+ int sd_numa_node;
+ /*
+ * Waitqueue for blocking while waiting for remote memory
+ * mappings to drop to zero.
+ */
+ wait_queue_head_t sd_mmap_wq;
+
+ /* When a nodeqp message is received, this is set.
+ * And it is reset by the watchdog time */
+ atomic_t sd_node_alive;
+ int num_active_conn; // plain int: callers of get/put_conn_count synchronize
+#ifdef ENABLE_TEST
+ struct workqueue_struct *producer;
+ struct workqueue_struct *consumer;
+ char producer_name[16];
+ char consumer_name[16];
+ struct work_struct producer_work;
+ struct work_struct consumer_work;
+ int count;
+ int test_done;
+#endif // ENABLE_TEST
+};
+
+extern struct micscif_dev scif_dev[];
+
+#include "mic/micscif_nodeqp.h"
+#include "mic/micscif_nm.h"
+#include "mic/micscif_smpt.h"
+#include "mic/micscif_va_gen.h"
+#include "mic/mic_dma_api.h"
+#include "mic/mic_dma_lib.h"
+#include "mic/micscif_rma.h"
+#include "mic/micscif_rma_list.h"
+
+/*
+ * data structure used to sync SCIF_GET_NODE_INFO messaging
+ */
+struct get_node_info {
+ enum micscif_msg_state state; // progress of the outstanding request
+ wait_queue_head_t wq; // requester blocks here until the reply updates state
+};
+
+/*
+ * align_low() - Round @data down to the previous multiple of @granularity.
+ * @data:        value to round down
+ * @granularity: alignment; must be a power of two (same contract the old
+ *               ALIGN()-based form already required)
+ *
+ * Equivalent to the previous ALIGN(data - (granularity - 1), granularity)
+ * for power-of-two granularity, but computes the answer directly with a
+ * mask instead of relying on an intermediate unsigned underflow when
+ * data < granularity - 1, and drops the dependency on the ALIGN() macro.
+ */
+static inline uint64_t align_low(uint64_t data, uint32_t granularity)
+{
+	return data & ~((uint64_t)granularity - 1);
+}
+
+#define SCIF_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define SCIF_MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+// End point state machine. "External" states are observable through the
+// SCIF API; "Internal" states exist only inside the driver.
+enum endptstate {
+ SCIFEP_CLOSED, // Internal state
+ SCIFEP_UNBOUND, // External state
+ SCIFEP_BOUND, // External state
+ SCIFEP_LISTENING, // External state
+ SCIFEP_CONNECTED, // External state
+ SCIFEP_CONNECTING, // Internal state
+ SCIFEP_MAPPING, // Internal state
+ SCIFEP_CLOSING, // Internal state
+ SCIFEP_CLLISTEN, // Internal state
+ SCIFEP_DISCONNECTED, // Internal state
+ SCIFEP_ZOMBIE // Internal state
+};
+
+extern char *scif_ep_states[];
+
+// Used for coordinating connection accept sequence. This is the data structure
+// for the conlist in the endpoint.
+struct conreq {
+ struct nodemsg msg; // node QP message carrying the request; payload[0] identifies it
+ struct list_head list; // link on the listening endpoint's conlist
+};
+
+/* Size of the RB for the Node QP */
+#define NODE_QP_SIZE 0x10000
+/* Size of the RB for the Endpoint QP */
+#define ENDPT_QP_SIZE 0x1000
+
+// Queue-pair bookkeeping for a single endpoint.
+struct endpt_qp_info {
+ /* Qpair for this endpoint */
+ struct micscif_qp *qp;
+ /*
+ * Physical addr of the QP for Host or
+ * GTT offset of the QP for MIC.
+ * Required for unmapping the QP during close.
+ */
+ dma_addr_t qp_offset;
+ /*
+ * Payload in a SCIF_CNCT_GNT message containing the
+ * physical address of the remote_qp.
+ */
+ dma_addr_t cnct_gnt_payload;
+};
+
+#define SCIFEP_MAGIC 0x5c1f000000005c1f
+
+// One SCIF end point: a (possibly connected) communication channel.
+// "lock" guards state transitions (see scif_invalidate_ep()).
+struct endpt {
+ volatile enum endptstate state;
+ spinlock_t lock;
+
+ struct scif_portID port; // local node/port pair
+ struct scif_portID peer; // remote node/port pair
+
+ int backlog;
+
+ struct endpt_qp_info qp_info;
+ struct endpt_rma_info rma_info;
+ /*
+ * scifdev used by this endpt to communicate with remote node.
+ */
+ struct micscif_dev *remote_dev;
+ uint64_t remote_ep;
+ /*
+ * Keep track of number of connection requests.
+ */
+ int conreqcnt;
+ /*
+ * Cache remote SCIF device state.
+ */
+ enum scif_state sd_state;
+ /*
+ * True if the endpoint was created
+ * via scif_accept(..).
+ */
+ bool accepted_ep;
+ /*
+ * Open file information used to match the id passed
+ * in with the flush routine.
+ */
+ struct files_struct *files;
+ /*
+ * Reference count for functions using this endpoint.
+ */
+ struct kref ref_count;
+ struct list_head conlist; // queued struct conreq connection requests
+ wait_queue_head_t conwq;
+ wait_queue_head_t disconwq;
+ wait_queue_head_t diswq;
+ wait_queue_head_t sendwq;
+ wait_queue_head_t recvwq;
+ struct mutex sendlock;
+ struct mutex recvlock;
+ struct list_head list; // link on one of the global ms_info lists
+
+#ifdef CONFIG_MMU_NOTIFIER
+ struct list_head mmu_list;
+#endif
+
+ struct list_head li_accept; /* pending ACCEPTREG */
+ int acceptcnt; /* pending ACCEPTREG cnt */
+ struct list_head liacceptlist; /* link to listen accept */
+ struct list_head miacceptlist; /* link to mi_uaccept */
+ struct endpt *listenep; /* associated listen ep */
+
+ /* Non-blocking connect */
+ struct work_struct conn_work;
+ struct scif_portID conn_port;
+ int conn_err;
+ int conn_async_state;
+ wait_queue_head_t conn_pend_wq;
+ struct list_head conn_list;
+};
+
+// Queue "window" on "list" (one of the ms_info RMA cleanup lists) and kick
+// the misc workqueue to destroy it asynchronously. The current DMA mark of
+// the endpoint's channel is recorded first — presumably so cleanup can wait
+// for in-flight DMAs on this window; confirm in micscif_misc_handler.
+static __always_inline void
+micscif_queue_for_cleanup(struct reg_range_t *window, struct list_head *list)
+{
+ struct endpt *ep = (struct endpt *)window->ep;
+ INIT_LIST_HEAD(&window->list_member);
+ window->dma_mark = get_dma_mark(ep->rma_info.dma_chan);
+ spin_lock(&ms_info.mi_rmalock);
+ list_add_tail(&window->list_member, list);
+ spin_unlock(&ms_info.mi_rmalock);
+ queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
+}
+
+// Move a temporary cached window from whatever list it is on to the
+// mi_rma_tc cleanup list. NOTE(review): the list_del here is not locked —
+// the caller presumably holds the lock protecting the window's current
+// list; confirm at call sites.
+static __always_inline void
+__micscif_rma_destroy_tcw_helper(struct reg_range_t *window)
+{
+ list_del(&window->list_member);
+ micscif_queue_for_cleanup(window, &ms_info.mi_rma_tc);
+}
+
+void print_ep_state(struct endpt *ep, char *label);
+
+// Function prototypes needed by Unix/Linux drivers linking to scif
+int scif_fdopen(struct file *f);
+int scif_fdclose(struct file *f);
+int scif_process_ioctl(struct file *f, unsigned int cmd, uint64_t arg);
+int micscif_mmap(struct file *file, struct vm_area_struct *vma);
+int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd);
+void scif_munmap(struct vm_area_struct *vma);
+void scif_proc_init(void);
+void scif_proc_cleanup(void);
+int scif_user_send(scif_epd_t epd, void *msg, int len, int flags);
+int scif_user_recv(scif_epd_t epd, void *msg, int len, int flags);
+int __scif_pin_pages(void *addr, size_t len, int *out_prot,
+ int map_flags, scif_pinned_pages_t *pages);
+scif_epd_t __scif_open(void);
+int __scif_bind(scif_epd_t epd, uint16_t pn);
+int __scif_listen(scif_epd_t epd, int backlog);
+int __scif_connect(scif_epd_t epd, struct scif_portID *dst, bool non_block);
+int __scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t
+*newepd, int flags);
+int __scif_close(scif_epd_t epd);
+int __scif_send(scif_epd_t epd, void *msg, int len, int flags);
+int __scif_recv(scif_epd_t epd, void *msg, int len, int flags);
+off_t __scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
+int prot_flags, int map_flags);
+int __scif_unregister(scif_epd_t epd, off_t offset, size_t len);
+int __scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
+roffset, int rma_flags);
+int __scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
+roffset, int rma_flags);
+int __scif_fence_mark(scif_epd_t epd, int flags, int *mark);
+int __scif_fence_wait(scif_epd_t epd, int mark);
+int __scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, off_t roff,
+uint64_t rval, int flags);
+off_t __scif_register_pinned_pages(scif_epd_t epd,
+scif_pinned_pages_t pinned_pages, off_t offset, int map_flags);
+int __scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
+struct scif_range **pages);
+int __scif_put_pages(struct scif_range *pages);
+int __scif_flush(scif_epd_t epd);
+
+void micscif_misc_handler(struct work_struct *work);
+void micscif_conn_handler(struct work_struct *work);
+
+uint16_t rsrv_scif_port(uint16_t port);
+uint16_t get_scif_port(void);
+void put_scif_port(uint16_t port);
+
+void micscif_send_exit(void);
+
+void scif_ref_rel(struct kref *kref_count);
+
+#ifdef _MODULE_SCIF_
+unsigned int micscif_poll(struct file *f, poll_table *wait);
+unsigned int scif_pollfd(struct file *f, poll_table *wait, scif_epd_t epd);
+unsigned int __scif_pollfd(struct file *f, poll_table *wait, struct endpt *ep);
+int micscif_flush(struct file *f, fl_owner_t id);
+#endif
+
+#ifdef _MIC_SCIF_
+void mic_debug_init(void);
+void micscif_get_node_info(void);
+void scif_poll_qp_state(struct work_struct *work);
+#endif
+void mic_debug_uninit(void);
+
+#define serializing_request(x) ((void)*(volatile uint8_t*)(x))
+
+// State list helper functions.
+// Each of these functions must be called with the end point lock unlocked. If
+// the end point is found on the list the end point returned will have its lock
+// set and sflags will return the value to be used to do an unlock_irqrestore
+// at the end of the calling function.
+// Look up the listening end point bound to "port".
+// On success the returned end point is LOCKED (hand-over-hand: ep->lock is
+// taken before mi_eplock is released) and *sflags receives the saved irq
+// flags for the caller's eventual spin_unlock_irqrestore(&ep->lock, *sflags).
+// Returns NULL when no listener is bound to the port.
+static inline struct endpt *
+micscif_find_listen_ep(uint16_t port, unsigned long *sflags)
+{
+ struct endpt *ep = NULL;
+ struct list_head *pos, *tmpq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ms_info.mi_eplock, flags);
+ list_for_each_safe(pos, tmpq, &ms_info.mi_listen) {
+ ep = list_entry(pos, struct endpt, list);
+ if (ep->port.port == port) {
+ *sflags = flags;
+ spin_lock(&ep->lock);
+ // Plain unlock (not irqrestore): interrupts stay disabled
+ // until the caller restores *sflags when unlocking the ep.
+ spin_unlock(&ms_info.mi_eplock);
+ return ep;
+ }
+ }
+ spin_unlock_irqrestore(&ms_info.mi_eplock, flags);
+ return (struct endpt *)NULL;
+}
+
+// Find and dequeue the connection request on "ep" whose first payload word
+// matches "payload"; returns NULL if no such request is queued.
+// Must be called with the end point locked.
+static inline struct conreq *
+miscscif_get_connection_request(struct endpt *ep, uint64_t payload)
+{
+	struct conreq *req, *next;
+
+	list_for_each_entry_safe(req, next, &ep->conlist, list) {
+		if (req->msg.payload[0] != payload)
+			continue;
+		list_del(&req->list);
+		ep->conreqcnt--;
+		return req;
+	}
+	return NULL;
+}
+
+// Remove "ep" from the global zombie list if it is there, decrementing the
+// zombie count. Unlike the helpers above, the caller does NOT need to hold
+// the end point lock.
+static inline void
+micscif_remove_zombie_ep(struct endpt *ep)
+{
+	struct endpt *cur, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ms_info.mi_eplock, flags);
+	list_for_each_entry_safe(cur, next, &ms_info.mi_zombie, list) {
+		if (cur != ep)
+			continue;
+		list_del(&cur->list);
+		ms_info.mi_nr_zombies--;
+	}
+	spin_unlock_irqrestore(&ms_info.mi_eplock, flags);
+}
+
+// Walk the zombie list and free every end point whose RMA state has fully
+// drained (micscif_rma_ep_can_uninit), tearing down its VA generator first.
+// Entries that cannot be freed yet stay on the list for a later pass.
+static inline void
+micscif_cleanup_zombie_epd(void)
+{
+ struct list_head *pos, *tmpq;
+ unsigned long sflags;
+ struct endpt *ep;
+
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ list_for_each_safe(pos, tmpq, &ms_info.mi_zombie) {
+ ep = list_entry(pos, struct endpt, list);
+ if (micscif_rma_ep_can_uninit(ep)) {
+ list_del(pos);
+ ms_info.mi_nr_zombies--;
+ va_gen_destroy(&ep->rma_info.va_gen);
+ kfree(ep);
+ }
+ }
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+}
+
+#define SCIF_WAKE_UP_SEND (1 << 1)
+#define SCIF_WAKE_UP_RECV (1 << 2)
+
+/**
+ * scif_wakeup_ep() - Wake up blocked clients on every connected endpoint.
+ * @type: bitmask of SCIF_WAKE_UP_SEND and/or SCIF_WAKE_UP_RECV selecting
+ *        whether threads blocked in scif_send(..), scif_recv(..), or both
+ *        are woken.
+ */
+static inline void
+scif_wakeup_ep(int type)
+{
+	struct endpt *ep, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ms_info.mi_connlock, flags);
+	list_for_each_entry_safe(ep, next, &ms_info.mi_connected, list) {
+		if (type & SCIF_WAKE_UP_SEND)
+			wake_up_interruptible(&ep->sendwq);
+		if (type & SCIF_WAKE_UP_RECV)
+			wake_up_interruptible(&ep->recvwq);
+	}
+	spin_unlock_irqrestore(&ms_info.mi_connlock, flags);
+}
+
+/*
+ * is_self_scifdev:
+ * @dev: The remote SCIF Device
+ *
+ * Returns nonzero if @dev is this node's own (loopback) SCIF device,
+ * i.e. its node id matches our own.
+ */
+static inline int is_self_scifdev(struct micscif_dev *dev)
+{
+	return ms_info.mi_nodeid == dev->sd_node;
+}
+
+/*
+ * is_p2p_scifdev:
+ * @dev: The remote SCIF Device
+ *
+ * True only on the card side (_MIC_SCIF_): a device that is neither the
+ * host node nor ourselves is a MIC peer-to-peer device. On the host this
+ * is always false.
+ */
+static inline bool is_p2p_scifdev(struct micscif_dev *dev)
+{
+#ifdef _MIC_SCIF_
+	if (dev == &scif_dev[SCIF_HOST_NODE])
+		return false;
+	return !is_self_scifdev(dev);
+#else
+	return false;
+#endif
+}
+
+/*
+ * get_conn_count:
+ * @dev: The remote SCIF Device
+ *
+ * Increments the number of active SCIF connections. The counter is a
+ * plain int, so the caller is expected to synchronize calls to this API
+ * with put_conn_count.
+ */
+static __always_inline void
+get_conn_count(struct micscif_dev *dev)
+{
+ dev->num_active_conn++;
+}
+
+/*
+ * put_conn_count:
+ * @dev: The remote SCIF Device
+ *
+ * Decrements the number of active connections. The caller is expected to
+ * synchronize calling this API with get_conn_count. BUG()s on underflow,
+ * which would indicate an unbalanced get/put pair.
+ */
+static __always_inline void
+put_conn_count(struct micscif_dev *dev)
+{
+ dev->num_active_conn--;
+ BUG_ON(dev->num_active_conn < 0);
+}
+
+/*
+ * get_kref_count:
+ * epd: SCIF endpoint
+ *
+ * Takes a reference on the endpoint. Pair every call with a matching
+ * put_kref_count(); callers provide their own synchronization.
+ */
+static __always_inline void
+get_kref_count(scif_epd_t epd)
+{
+	kref_get(&epd->ref_count);
+}
+
+/*
+ * put_kref_count:
+ * epd: SCIF endpoint
+ *
+ * Drops a reference taken with get_kref_count(); scif_ref_rel runs when
+ * the count reaches zero. Callers provide their own synchronization.
+ */
+static __always_inline void
+put_kref_count(scif_epd_t epd)
+{
+	kref_put(&epd->ref_count, scif_ref_rel);
+}
+
+/*
+ * scifdev_alive:
+ * @ep: the endpoint to check
+ *
+ * Returns true when the endpoint's remote SCIF device is usable: the
+ * remote device is RUNNING or SLEEPING, and the endpoint's cached device
+ * state is still RUNNING.
+ */
+static inline int scifdev_alive(struct endpt *ep)
+{
+	enum scif_state remote = ep->remote_dev->sd_state;
+
+	if (ep->sd_state != SCIFDEV_RUNNING)
+		return 0;
+	return remote == SCIFDEV_RUNNING || remote == SCIFDEV_SLEEPING;
+}
+
+/*
+ * verify_epd:
+ * ep: SCIF endpoint
+ *
+ * Checks several generic error conditions and returns the
+ * appropriate error:
+ *   -ECONNRESET  the peer disconnected,
+ *   -ENOTCONN    the endpoint is not (or not yet) connected,
+ *   -ENODEV      the remote device is no longer alive,
+ *   0            the endpoint is usable.
+ */
+static inline int verify_epd(struct endpt *ep)
+{
+ if (ep->state == SCIFEP_DISCONNECTED)
+ return -ECONNRESET;
+
+ if (ep->state != SCIFEP_CONNECTED)
+ return -ENOTCONN;
+
+ if (!scifdev_alive(ep))
+ return -ENODEV;
+
+ return 0;
+}
+
+/**
+ * scif_invalidate_ep() - Set remote SCIF device state for all connected
+ * and disconnected endpoints for a particular node to SCIFDEV_STOPPED,
+ * change endpoint state to disconnected and wake up all send/recv/con
+ * waitqueues.
+ * @node: node id whose endpoints are invalidated (typically a lost node).
+ */
+static inline void
+scif_invalidate_ep(int node)
+{
+ struct endpt *ep;
+ unsigned long sflags;
+ struct list_head *pos, *tmpq;
+
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ // Already-disconnected endpoints only need their device state stamped.
+ list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
+ ep = list_entry(pos, struct endpt, list);
+ if (ep->remote_dev->sd_node == node) {
+ spin_lock(&ep->lock);
+ ep->sd_state = SCIFDEV_STOPPED;
+ spin_unlock(&ep->lock);
+ }
+ }
+ // Connected endpoints move to the disconnected list; wake every waiter
+ // so blocked send/recv/connect callers observe the state change.
+ list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
+ ep = list_entry(pos, struct endpt, list);
+ if (ep->remote_dev->sd_node == node) {
+ list_del(pos);
+ put_conn_count(ep->remote_dev);
+ spin_lock(&ep->lock);
+ ep->state = SCIFEP_DISCONNECTED;
+ list_add_tail(&ep->list, &ms_info.mi_disconnected);
+ ep->sd_state = SCIFDEV_STOPPED;
+ wake_up_interruptible(&ep->sendwq);
+ wake_up_interruptible(&ep->recvwq);
+ wake_up_interruptible(&ep->conwq);
+ spin_unlock(&ep->lock);
+ }
+ }
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+ flush_workqueue(ms_info.mi_conn_wq);
+}
+
+/*
+ * Only Debug Functions Below
+ */
+#define SCIF_CRUMB pr_debug("%s %d\n", __func__, __LINE__)
+
+// Debug only: dump the registered-window lists (local and remote) of every
+// zombie end point, bracketed by start/end markers in the log.
+static inline void
+micscif_display_all_zombie_ep(void)
+{
+ struct list_head *pos, *tmpq;
+ unsigned long sflags;
+ struct endpt *ep;
+
+ pr_debug("Zombie Info Start\n");
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ list_for_each_safe(pos, tmpq, &ms_info.mi_zombie) {
+ ep = list_entry(pos, struct endpt, list);
+ if (!list_empty(&ep->rma_info.reg_list))
+ micscif_display_all_windows(&ep->rma_info.reg_list);
+ if (!list_empty(&ep->rma_info.remote_reg_list))
+ micscif_display_all_windows(
+ &ep->rma_info.remote_reg_list);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+ pr_debug("Zombie Info End\n");
+}
+
+// Debug helper: log the interesting fields of an endpoint.
+// Fixed: the concatenated format string was missing separators between
+// "port.node 0x%x"/"port.port" and between "qp %p"/"qp_offset", so those
+// fields ran together in the log output.
+static inline void dump_ep(scif_epd_t epd, const char *func, int line)
+{
+	struct endpt *ep = (struct endpt *)epd;
+	pr_debug("%s %d state %d lock %p port.node 0x%x "
+		"port.port 0x%x peer.node 0x%x peer.port 0x%x backlog %d qp %p "
+		"qp_offset 0x%llx cnct_gnt_payload 0x%llx remote_dev %p\n",
+		func, line, ep->state, &ep->lock, ep->port.node,
+		ep->port.port, ep->peer.node, ep->peer.port, ep->backlog,
+		ep->qp_info.qp, ep->qp_info.qp_offset,
+		ep->qp_info.cnct_gnt_payload, ep->remote_dev);
+}
+
+// Debug helper: log the buffer/QP addresses of a queue pair.
+static inline void dump_qp(volatile struct micscif_qp *qp, const char *func, int line)
+{
+ pr_debug("%s %d qp %p local_buf 0x%llx"
+ " local_qp 0x%llx remote_buf 0x%llx remote_qp %p ep 0x%llx\n",
+ func, line, qp, qp->local_buf,
+ qp->local_qp, qp->remote_buf, qp->remote_qp, qp->ep);
+}
+
+// Debug helper: log a ring buffer's pointers, size, and current/old
+// read/write offsets.
+static inline void dump_rb(struct micscif_rb *rb, const char *func, int line)
+{
+ pr_debug("%s %d rb %p rb_base %p *read_ptr 0x%x"
+ " *write_ptr 0x%x size 0x%x"
+ " cro 0x%x cwo 0x%x ocro 0x%x ocwo 0x%x\n",
+ func, line, rb, rb->rb_base, *rb->read_ptr,
+ *rb->write_ptr, rb->size, rb->current_read_offset,
+ rb->current_write_offset,
+ rb->old_current_read_offset,
+ rb->old_current_write_offset);
+}
+
+#endif /* MICSCIF_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICSCIF_INTR_H
+#define MICSCIF_INTR_H
+#define SBOX_SDBIC0_DBSTAT_BIT 0x40000000
+#define SBOX_SDBIC0_DBREQ_BIT 0x80000000
+
+/* RDMASR Info */
+#define RDMASR_IRQ_BASE 17
+#define get_rdmasr_irq(m) ((RDMASR_IRQ_BASE) + (m))
+#define get_rdmasr_offset(m) (((m) << 2) + (SBOX_RDMASR0))
+
+#ifdef _MIC_SCIF_
+int register_scif_intr_handler(struct micscif_dev *dev);
+void deregister_scif_intr_handler(struct micscif_dev *dev);
+#endif
+int micscif_setup_interrupts(struct micscif_dev *dev);
+void micscif_destroy_interrupts(struct micscif_dev *scifdev);
+#endif /* MICSCIF_INTR_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MIC_KMEM_CACHE_H
+#define MIC_KMEM_CACHE_H
+#define MAX_UNALIGNED_BUF_SIZE (1024 * 1024ULL)
+#define KMEM_UNALIGNED_BUF_SIZE (MAX_UNALIGNED_BUF_SIZE + (L1_CACHE_BYTES << 1))
+#include<linux/slab.h>
+extern struct kmem_cache *unaligned_cache;
+
+// Return a buffer obtained from micscif_kmem_cache_alloc() to the cache.
+static inline void micscif_kmem_cache_free(void *buffer)
+{
+ kmem_cache_free(unaligned_cache, buffer);
+}
+
+/*
+ * Allocate one KMEM_UNALIGNED_BUF_SIZE buffer from the cache.
+ *
+ * Fixed: the original passed GFP_KERNEL|GFP_ATOMIC, which is contradictory —
+ * GFP_KERNEL permits the allocator to sleep, so the combination is NOT
+ * atomic-safe. GFP_ATOMIC alone is valid from any context (at the cost of
+ * dipping into emergency reserves). If every caller is known to run in
+ * process context, this could be relaxed to plain GFP_KERNEL instead.
+ */
+static inline void *micscif_kmem_cache_alloc(void)
+{
+	return kmem_cache_alloc(unaligned_cache, GFP_ATOMIC);
+}
+
+// Create the cache of unaligned-DMA bounce buffers. Each object is
+// KMEM_UNALIGNED_BUF_SIZE bytes (1MB payload plus two cache lines of slack),
+// hardware-cacheline aligned.
+static inline struct kmem_cache *micscif_kmem_cache_create(void)
+{
+ return kmem_cache_create("Unaligned_DMA", KMEM_UNALIGNED_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL);
+}
+
+// Destroy the bounce-buffer cache; all objects must have been freed first.
+static inline void micscif_kmem_cache_destroy(void)
+{
+ kmem_cache_destroy(unaligned_cache);
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICSCIF_MAP_H
+#define MICSCIF_MAP_H
+
+/*
+ * get_local_va - compute the kernel virtual address backing offset @off
+ * within the registered window @window.
+ *
+ * @off:    registered offset to translate
+ * @window: window whose pinned pages back the offset
+ * @len:    unused here; kept for interface compatibility with callers
+ *
+ * Uses uintptr_t and '+' rather than a uint64_t cast and '|':
+ * page_address() returns a page-aligned pointer, so addition of the
+ * sub-page offset is equivalent, and uintptr_t is the portable
+ * pointer-sized integer type.
+ */
+static __always_inline
+void *get_local_va(off_t off, struct reg_range_t *window, size_t len)
+{
+	struct page **pages = window->pinned_pages->pages;
+	uint64_t page_nr = (off - window->offset) >> PAGE_SHIFT;
+	off_t page_off = off & ~PAGE_MASK;
+
+	return (void *)((uintptr_t)page_address(pages[page_nr]) + page_off);
+}
+
+/*
+ * scif_iounmap - undo a scif_ioremap() of @virt (@len bytes) for @dev.
+ * Only remote mappings on the card side were actually ioremap'd;
+ * loopback (self) addresses came from the direct map and need no
+ * teardown. No-op when not built for the card (_MIC_SCIF_ unset).
+ */
+static __always_inline void
+scif_iounmap(void *virt, size_t len, struct micscif_dev *dev)
+{
+#ifdef _MIC_SCIF_
+	if (is_self_scifdev(dev))
+		return;
+	iounmap(virt);
+#endif
+}
+
+#ifdef _MIC_SCIF_
+/* FIXME: fix the documentation and functions names since these are also
+ * used in p2p
+ */
+/*
+ * Maps the VA passed in local to the aperture and returns the
+ * corresponding GTT index in offset by reference.
+ * In the loopback case simply return the physical address.
+ */
+static __always_inline int
+map_virt_into_aperture(phys_addr_t *out_offset,
+		       void *local,
+		       struct micscif_dev *dev,
+		       size_t size)
+{
+	/* Both the loopback and the remote case start from the local
+	 * physical address; remote non-host peers additionally need the
+	 * aperture base added in.
+	 * Error unwinding code relies on the return value being zero.
+	 */
+	*out_offset = virt_to_phys(local);
+	if (!is_self_scifdev(dev) && dev != &scif_dev[0])
+		*out_offset += dev->sd_base_addr;
+
+	return 0;
+}
+
+/*
+ * Maps the struct page passed in page to the aperture and returns the
+ * corresponding GTT index in offset by reference.
+ * In the loopback case simply return the physical address.
+ */
+static __always_inline int
+map_page_into_aperture(phys_addr_t *out_offset,
+		       struct page *page,
+		       struct micscif_dev *dev)
+{
+	/* Start from the CPU physical address in every case; remote
+	 * non-host peers additionally need the aperture base added in.
+	 * Error unwinding code relies on the return value being zero.
+	 */
+	*out_offset = page_to_phys(page);
+	if (!is_self_scifdev(dev) && dev != &scif_dev[0])
+		*out_offset += dev->sd_base_addr;
+	return 0;
+}
+
+/*
+ * unmap_from_aperture - release a mapping created by
+ * map_virt_into_aperture()/map_page_into_aperture().
+ * Nothing to do on card side: the mappings above are plain physical
+ * addresses, so there is nothing to undo. Parameters are accepted only
+ * for API symmetry with the host-side implementation.
+ */
+static __always_inline void
+unmap_from_aperture(phys_addr_t local,
+		    struct micscif_dev *dev,
+		    size_t size)
+{
+}
+
+/*
+ * Maps Host physical address passed in phys to MIC.
+ * In the loopback case simply return the VA from the PA; remote
+ * accesses get an uncached ioremap of size bytes.
+ */
+static __always_inline void *
+scif_ioremap(phys_addr_t phys, size_t size, struct micscif_dev *dev)
+{
+	if (is_self_scifdev(dev))
+		return phys_to_virt(phys);
+
+	return ioremap_nocache(phys, size);
+}
+
+/*
+ * Get the system physical address from the physical address passed
+ * by the host. On the card side (and in loopback) no translation is
+ * required, so the address is returned unchanged; @dev is unused.
+ */
+static __always_inline phys_addr_t
+get_phys_addr(phys_addr_t phys, struct micscif_dev *dev)
+{
+	return phys;
+}
+
+#else /* !_MIC_SCIF_ */
+/*
+ * Maps the VA passed in local to the aperture and returns the
+ * corresponding physical address in offset.
+ * In the loopback case simply return the physical address.
+ * Returns 0 on success, -ENOMEM on mapping failure (with *out_offset
+ * zeroed, which error unwinding code relies on).
+ */
+static __always_inline int
+map_virt_into_aperture(phys_addr_t *out_offset,
+		       void *local,
+		       struct micscif_dev *dev,
+		       size_t size)
+{
+	int bid;
+	struct pci_dev *hwdev;
+
+	/* Loopback: the CPU physical address is directly usable. */
+	if (is_self_scifdev(dev)) {
+		*out_offset = virt_to_phys(local);
+		return 0;
+	}
+
+	bid = dev->sd_node - 1;
+	hwdev = get_per_dev_ctx(bid)->bi_pdev;
+	*out_offset = mic_map_single(bid, hwdev, local, size);
+	if (mic_map_error(*out_offset)) {
+		*out_offset = 0;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+/*
+ * map_page_into_aperture - map @page for remote access and return the
+ * corresponding physical address in @out_offset.
+ * In the loopback case simply return the physical address.
+ *
+ * For a remote device the page is first DMA-mapped through the board's
+ * PCI device and then entered into the aperture via mic_map().
+ *
+ * Returns 0 on success, -EINVAL on mapping failure; *out_offset is
+ * zeroed on failure, which error unwinding code relies on.
+ *
+ * Fixes vs. the previous version:
+ *  - the printk format string used a backslash line continuation, which
+ *    spliced the next line's indentation into the logged message; use
+ *    adjacent string literals instead.
+ *  - pci_map_page() is now paired with pci_unmap_page() (the previous
+ *    code called pci_unmap_single() on the mic_map() failure path).
+ */
+static __always_inline int
+map_page_into_aperture(phys_addr_t *out_offset,
+		       struct page *page,
+		       struct micscif_dev *dev)
+{
+	int err = 0;
+	int bid;
+	dma_addr_t mic_addr;
+	struct pci_dev *hwdev;
+
+	if (is_self_scifdev(dev))
+		*out_offset = page_to_phys(page);
+	else {
+		bid = dev->sd_node - 1;
+		hwdev = get_per_dev_ctx(bid)->bi_pdev;
+
+		*out_offset = pci_map_page(hwdev, page, 0x0, PAGE_SIZE,
+					   PCI_DMA_BIDIRECTIONAL);
+		if (pci_dma_mapping_error(hwdev, *out_offset)) {
+			err = -EINVAL;
+		} else {
+			mic_addr = mic_map(bid, *out_offset, PAGE_SIZE);
+			if (!mic_addr) {
+				printk(KERN_ERR "mic_map failed board id %d "
+				       "addr %#016llx size %#016lx\n",
+				       bid,
+				       (unsigned long long)*out_offset,
+				       PAGE_SIZE);
+				pci_unmap_page(hwdev, *out_offset,
+					       PAGE_SIZE,
+					       PCI_DMA_BIDIRECTIONAL);
+				err = -EINVAL;
+			} else
+				*out_offset = mic_addr;
+		}
+	}
+
+	if (err)
+		*out_offset = 0;
+
+	return err;
+}
+
+/*
+ * Unmaps the physical address passed in local from the PCIe aperture.
+ * Nothing to do in the loopback case.
+ */
+static __always_inline void
+unmap_from_aperture(phys_addr_t local,
+		    struct micscif_dev *dev,
+		    size_t size)
+{
+	if (is_self_scifdev(dev))
+		return;
+
+	mic_ctx_unmap_single(get_per_dev_ctx(dev->sd_node - 1),
+			     local, size);
+}
+
+/*
+ * TODO: consider moving the aperture offset arithmetic out of this
+ * helper and into the callers.
+ * Maps the page corresponding to the aperture offset passed in phys.
+ * In the loopback case simply return the VA from the PA.
+ */
+static __always_inline void *
+scif_ioremap(phys_addr_t phys, size_t size, struct micscif_dev *dev)
+{
+	if (is_self_scifdev(dev))
+		return phys_to_virt(phys);
+
+	/* Host side: the card's aperture is already mapped; just offset
+	 * into its virtual base. @size is not needed here.
+	 */
+	return get_per_dev_ctx(dev->sd_node - 1)->aper.va + phys;
+}
+
+/*
+ * get_phys_addr - translate a card-relative aperture offset into a
+ * system physical address by adding the board's aperture base.
+ * Loopback addresses are already system physical addresses and are
+ * returned unchanged.
+ */
+static __always_inline phys_addr_t
+get_phys_addr(phys_addr_t phys, struct micscif_dev *dev)
+{
+	phys_addr_t aper_base;
+
+	if (is_self_scifdev(dev))
+		return phys;
+
+	aper_base = (phys_addr_t)get_per_dev_ctx(dev->sd_node - 1)->aper.pa;
+	return phys + aper_base;
+}
+
+#endif /* !_MIC_SCIF_ */
+
+#endif /* MICSCIF_MAP_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICSCIF_NM_H
+#define MICSCIF_NM_H
+
+#include <scif.h>
+
+#ifdef MIC_IS_EMULATION
+#define DEFAULT_WATCHDOG_TO (INT_MAX)
+#define NODE_ALIVE_TIMEOUT (INT_MAX)
+#define NODE_QP_TIMEOUT (INT_MAX)
+#define NODE_ACCEPT_TIMEOUT (INT_MAX)
+#define NODEQP_SEND_TO_MSEC (INT_MAX)
+#else
+#define DEFAULT_WATCHDOG_TO (30)
+#define NODE_ALIVE_TIMEOUT (ms_info.mi_watchdog_to * HZ)
+#define NODE_QP_TIMEOUT (100)
+#define NODE_ACCEPT_TIMEOUT (3 * HZ)
+#define NODEQP_SEND_TO_MSEC (3 * 1000)
+#endif
+
+#define SCIF_ENABLE_PM 1
+
+#define DESTROY_WQ (true)
+
+enum disconn_type {
+ DISCONN_TYPE_POWER_MGMT,
+ DISCONN_TYPE_LOST_NODE,
+ DISCONN_TYPE_MAINTENANCE_MODE,
+};
+
+/*
+ * Notify the host about a new dependency with the remote SCIF device.
+ * Dependencies are created during scif_mmap()/scif_get_pages().
+ */
+void micscif_create_node_dep(struct micscif_dev *dev, int nr_pages);
+
+/*
+ * Notify the host that an existing dependency with the remote SCIF
+ * device no longer exists.
+ */
+void micscif_destroy_node_dep(struct micscif_dev *dev, int nr_pages);
+
+/**
+ * micscif_inc_node_refcnt:
+ *
+ * @dev: Remote SCIF device.
+ * @cnt: Amount to add to the activity ref count.
+ *
+ * Increment the global activity ref count for the remote SCIF device.
+ * If the remote SCIF device is idle, then notify the host to wake up
+ * the remote SCIF device and then wait for an ACK.
+ *
+ * Fix vs. previous version: the nodeqp send passed the corrupted token
+ * "¬if_msg" (a mangled "&notif_msg"); restored to &notif_msg. The
+ * kernel-doc also named the parameter @count; it is @cnt.
+ */
+static __always_inline void
+micscif_inc_node_refcnt(struct micscif_dev *dev, long cnt)
+{
+#ifdef SCIF_ENABLE_PM
+	if (unlikely(dev && !atomic_long_add_unless(&dev->scif_ref_cnt,
+			cnt, SCIF_NODE_IDLE))) {
+		/*
+		 * This code path would not be entered unless the remote
+		 * SCIF device has actually been put to sleep by the host.
+		 */
+		mutex_lock(&dev->sd_lock);
+		if (SCIFDEV_STOPPED == dev->sd_state ||
+			SCIFDEV_STOPPING == dev->sd_state ||
+			SCIFDEV_INIT == dev->sd_state)
+			goto bail_out;
+		if (test_bit(SCIF_NODE_MAGIC_BIT,
+				&dev->scif_ref_cnt.counter)) {
+			/* Notify host that the remote node must be woken */
+			struct nodemsg notif_msg;
+
+			dev->sd_wait_status = OP_IN_PROGRESS;
+			notif_msg.uop = SCIF_NODE_WAKE_UP;
+			notif_msg.src.node = ms_info.mi_nodeid;
+			notif_msg.dst.node = SCIF_HOST_NODE;
+			notif_msg.payload[0] = dev->sd_node;
+			/* No error handling for Host SCIF device */
+			micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE],
+				&notif_msg, NULL);
+			/*
+			 * A timeout is not required since only the cards can
+			 * initiate this message. The Host is expected to be alive.
+			 * If the host has crashed then so will the cards.
+			 */
+			wait_event(dev->sd_wq,
+				dev->sd_wait_status != OP_IN_PROGRESS);
+			/*
+			 * Aieee! The host could not wake up the remote node.
+			 * Bail out for now.
+			 */
+			if (dev->sd_wait_status == OP_COMPLETED) {
+				dev->sd_state = SCIFDEV_RUNNING;
+				clear_bit(SCIF_NODE_MAGIC_BIT,
+					&dev->scif_ref_cnt.counter);
+			}
+		}
+		/* The ref count was not added if the node was idle. */
+		atomic_long_add(cnt, &dev->scif_ref_cnt);
+bail_out:
+		mutex_unlock(&dev->sd_lock);
+	}
+#endif
+}
+
+/**
+ * micscif_dec_node_refcnt:
+ *
+ * @dev: Remote SCIF device.
+ * @cnt: Amount to subtract from the activity ref count.
+ *
+ * Decrement the global activity ref count for the remote SCIF device.
+ * If the count drops below zero (which should not happen unless the
+ * node was lost), log an error and restore the count, stopping at
+ * SCIF_NODE_IDLE.
+ */
+static __always_inline void
+micscif_dec_node_refcnt(struct micscif_dev *dev, long cnt)
+{
+#ifdef SCIF_ENABLE_PM
+	if (dev) {
+		if (unlikely((atomic_long_sub_return(cnt,
+				&dev->scif_ref_cnt)) < 0)) {
+			printk(KERN_ERR "%s %d dec dev %p node %d ref %ld "
+				" caller %p Lost Node?? \n",
+				__func__, __LINE__, dev, dev->sd_node,
+				atomic_long_read(&dev->scif_ref_cnt),
+				__builtin_return_address(0));
+			/* Undo the over-decrement, but never past IDLE. */
+			atomic_long_add_unless(&dev->scif_ref_cnt, cnt,
+				SCIF_NODE_IDLE);
+		}
+	}
+#endif
+}
+
+/* Handle a SCIF_NODE_REMOVE message */
+uint64_t micscif_handle_remove_node(uint64_t mask, uint64_t flags);
+void micscif_cleanup_scifdev(struct micscif_dev *dev, bool destroy_wq);
+
+void micscif_node_add_callback(int node);
+
+void set_nodemask_bit(uint8_t* nodemask, uint32_t node_id, int val);
+int get_nodemask_bit(uint8_t* nodemask, uint32_t node_id);
+
+#ifndef _MIC_SCIF_
+
+/* definition of stack node used in activation/deactivation set algorithms*/
+struct stack_node {
+ struct list_head next;
+ uint32_t node_id;
+};
+
+enum dependency_state {
+ DEP_STATE_NOT_DEPENDENT,
+ DEP_STATE_DEPENDENT,
+ DEP_STATE_DISCONNECT_READY,
+ DEP_STATE_DISCONNECTED
+};
+
+
+uint64_t micscif_send_pm_rmnode_msg(int node, uint64_t nodemask_addr,
+ uint64_t nodemask_size, int orig_node);
+uint64_t micscif_send_lost_node_rmnode_msg(int node, int orig_node);
+
+/* definitions of stack methods used in activation/deactivation set algorithms */
+int init_depgraph_stack(struct list_head *stack_ptr);
+int uninit_depgraph_stack(struct list_head *stack_ptr);
+int is_stack_empty(struct list_head *stack_ptr);
+int stack_push_node(struct list_head *stack_ptr, uint32_t node_id);
+int stack_pop_node(struct list_head *stack_ptr, uint32_t *node_id);
+int micscif_get_activeset(uint32_t node_id, uint8_t *nodemask);
+int micscif_get_minimal_deactiveset(uint32_t node_id, uint8_t *nodemask, uint8_t *visited);
+int micscif_get_deactiveset(uint32_t node_id, uint8_t *nodemask, int max_possible);
+void micscif_update_p2p_state(uint32_t node_id, uint32_t peer_id, enum scif_state state);
+
+/* Method responsible for disconnecting node from the scif network */
+int micscif_disconnect_node(uint32_t node_id, uint8_t *nodemask, enum disconn_type type);
+int micscif_connect_node(uint32_t node_id, bool get_ref);
+
+void micscif_set_nodedep(uint32_t src_node, uint32_t dst_node, enum dependency_state state);
+enum dependency_state micscif_get_nodedep(uint32_t src_node, uint32_t dst_node);
+uint64_t micscif_send_node_alive(int node);
+void micscif_watchdog_handler(struct work_struct *work);
+int micscif_handle_lostnode(uint32_t nodeid);
+#endif /*_MIC_SCIF_*/
+
+/* SCIF tasks before transition to low power state */
+int micscif_suspend_handler(struct notifier_block *notif,
+ unsigned long event, void *ptr);
+
+/*
+ * SCIF tasks if a previous low power state transition
+ * has failed after a suspend call.
+ */
+int micscif_fail_suspend_handler(struct notifier_block *notif,
+ unsigned long event, void *ptr);
+
+/* SCIF tasks after wake up from low power state */
+int micscif_resume_handler(struct notifier_block *notif,
+ unsigned long event, void *ptr);
+
+#endif /* MICSCIF_NM_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICSCIF_NODEQP
+#define MICSCIF_NODEQP
+
+#include "micscif_rb.h"
+
+ /* Payload Description */
+#define SCIF_INIT 1 /* Payload: address of node's node array. First message
+		     * sent by a node to the host, and host to node
+		     */
+#define SCIF_EXIT 2 /* Last message telling the host the driver is exiting */
+#define SCIF_NODE_ADD 3 /* Tell Online nodes a new node exists */
+#define SCIF_NODE_ADD_ACK 4 /* Confirm to host sequence is finished TODO Needed??? */
+#define SCIF_CNCT_REQ 5 /* Phys addr of Request connection to a port */
+#define SCIF_CNCT_GNT 6 /* Phys addr of new Grant connection request */
+#define SCIF_CNCT_GNTACK 7 /* Error type Ack a connection grant */
+#define SCIF_CNCT_GNTNACK 8 /* Error type Nack a connection grant */
+#define SCIF_CNCT_REJ 9 /* Error type Reject a connection request */
+#define SCIF_CNCT_TERM 10 /* Terminate type Terminate a connection request */
+#define SCIF_TERM_ACK 11 /* Terminate type Terminate a connection request */
+#define SCIF_DISCNCT 12 /* Notify peer that connection is being terminated */
+#define SCIF_DISCNT_ACK 13 /* Notify peer that connection is being terminated */
+#define SCIF_REGISTER 14 /* Tell peer about a new registered window */
+#define SCIF_REGISTER_ACK 15 /* Notify peer about registration success */
+#define SCIF_REGISTER_NACK 16 /* Notify peer about registration failure */
+#define SCIF_UNREGISTER 17 /* Tell peer about unregistering a registered window */
+#define SCIF_UNREGISTER_ACK 18 /* Notify peer about unregistration success */
+#define SCIF_UNREGISTER_NACK 19 /* Notify peer about unregistration failure */
+#define SCIF_ALLOC_REQ 20 /* Request a mapped buffer */
+#define SCIF_ALLOC_GNT 21 /* Notify peer about allocation success */
+#define SCIF_ALLOC_REJ 22 /* Notify peer about allocation failure */
+#define SCIF_FREE_PHYS 23 /* Free previously allocated GTT/PCI mappings */
+#define SCIF_FREE_VIRT 24 /* Free previously allocated virtual memory */
+#define SCIF_CLIENT_SENT 25 /* Notify the peer that a data message has been written to the RB */
+#define SCIF_CLIENT_RCVD 26 /* Notify the peer that a data message has been read from the RB */
+#define SCIF_MUNMAP 27 /* Acknowledgment for a SCIF_MMAP request */
+#define SCIF_MARK 28 /* SCIF Remote Fence Mark Request */
+#define SCIF_MARK_ACK 29 /* SCIF Remote Fence Mark Success */
+#define SCIF_MARK_NACK 30 /* SCIF Remote Fence Mark Failure */
+#define SCIF_WAIT 31 /* SCIF Remote Fence Wait Request */
+#define SCIF_WAIT_ACK 32 /* SCIF Remote Fence Wait Success */
+#define SCIF_WAIT_NACK 33 /* SCIF Remote Fence Wait Failure */
+#define SCIF_SIG_LOCAL 34 /* SCIF Remote Fence Local Signal Request */
+#define SCIF_SIG_REMOTE 35 /* SCIF Remote Fence Remote Signal Request */
+#define SCIF_SIG_ACK 36 /* SCIF Remote Fence Remote Signal Success */
+#define SCIF_SIG_NACK 37 /* SCIF Remote Fence Remote Signal Failure */
+#define SCIF_NODE_CREATE_DEP 42 /* Notify the Host that a new dependency is
+ * being created between two nodes
+ */
+#define SCIF_NODE_DESTROY_DEP 43 /* Notify the Host that an existing dependency is
+ * being destroyed between two nodes
+ */
+#define SCIF_NODE_REMOVE 44 /* Request to deactivate a set of remote SCIF nodes */
+#define SCIF_NODE_REMOVE_ACK 45 /* Response to a SCIF_NODE_REMOVE message */
+#define SCIF_NODE_WAKE_UP 46 /* Notification to the Host to wake up a remote node */
+#define SCIF_NODE_WAKE_UP_ACK 47 /* Response to SCIF_NODE_WAKE_UP message */
+#define SCIF_NODE_WAKE_UP_NACK 48 /* Response to SCIF_NODE_WAKE_UP message. Think Lost Node */
+#define SCIF_NODE_ALIVE 49 /* Check if kn* card is alive */
+#define SCIF_NODE_ALIVE_ACK 50 /* ACK the for above message */
+#define SMPT_SET 51 /* Add a smpt entry */
+#define SCIF_PROXY_DMA 56 /* Proxies DMA read requests to peer for performance */
+#define SCIF_PROXY_ORDERED_DMA 57 /* Proxies DMA read requests to peer for performance */
+#define SCIF_NODE_CONNECT 58 /* Setup a p2p connection b/w two nodes */
+#define SCIF_NODE_CONNECT_NACK 59 /* p2p connection is not successful */
+#define SCIF_NODE_ADD_NACK 60 /* SCIF_NODE_ADD failed report to the waiting thread(s) */
+#define SCIF_GET_NODE_INFO 61 /* Get current node mask from the host*/
+#define SCIF_TEST 62 /* Test value Used for test only */
+#define SCIF_MAX_MSG SCIF_TEST
+
+
+/*
+ * The *only* reason we need 2 uint64_t for payload
+ * right now is because the SCIF_CNCT_GNT message needs
+ * to send across both the QP offset and the QP id.
+ *
+ * Now we have to increase this to 3 uint64_t because
+ * the Alloc message requires the remote EP, allocation size
+ * and the allocation handle.
+ *
+ * Increased to 4 uint64_t because SCIF_FENCE requires
+ * ep, offset, len and the waitqueue pointer to wake up.
+ */
+struct nodemsg {
+	struct scif_portID src;	/* originating node/port pair */
+	struct scif_portID dst;	/* destination node/port pair */
+	uint32_t uop;		/* message opcode, one of the SCIF_* values above */
+	uint64_t payload[4];	/* opcode-specific data (see sizing comment above) */
+} __attribute__ ((packed));
+
+
+/*
+ * Generic state used for certain node QP message exchanges
+ * like Unregister, Alloc etc.
+ */
+enum micscif_msg_state {
+	OP_IDLE = 1,	/* no exchange in flight */
+	OP_IN_PROGRESS,	/* request sent, waiting for the peer's reply */
+	OP_COMPLETED,	/* peer reported success */
+	OP_FAILED	/* peer reported failure */
+};
+
+/*
+ * Generic structure used for exchanging ALLOC_REQ/GNT messages.
+ */
+struct allocmsg {
+	dma_addr_t phys_addr;		/* DMA address of the allocated buffer */
+	void *vaddr;			/* kernel virtual address of the buffer */
+	uint32_t uop;			/* opcode for this exchange (ALLOC_REQ/GNT family) */
+	size_t size;			/* allocation size in bytes */
+	enum micscif_msg_state state;	/* progress of the REQ/GNT handshake */
+	wait_queue_head_t allocwq;	/* woken when the peer's reply arrives — TODO confirm waker */
+};
+
+/* Interesting structure -- a little difficult because we can only
+ * write across the PCIe, so any r/w pointer we need to read is
+ * local. We only need to read the read pointer on the inbound_q
+ * and read the write pointer in the outbound_q
+ */
+struct micscif_qp {
+	uint64_t ep;			/* endpoint cookie associated with this QP */
+	uint64_t magic;			/* SCIFEP_MAGIC once the QP is initialized — verify */
+	uint64_t blast;
+#define SCIFEP_MAGIC 0x5c1f000000005c1f
+	struct micscif_rb outbound_q;	/* ring for messages we send to the peer */
+	struct micscif_rb inbound_q;	/* ring for messages the peer sends to us */
+	/* FIXME cache align local_write/read */
+	uint32_t local_write; /* For local inbound */
+	uint32_t local_read; /* For local outbound */
+	volatile struct micscif_qp *remote_qp;	/* peer's QP structure */
+	dma_addr_t local_buf; /* Local BS */
+	dma_addr_t local_qp;	/* DMA address of this QP structure */
+	dma_addr_t remote_buf; /* Remote BS */
+	volatile uint32_t qp_state;	/* QP_OFFLINE or QP_ONLINE */
+#define QP_OFFLINE 0xdead
+#define QP_ONLINE 0xc0de
+	uint16_t scif_version;	/* compatibility version exchanged with peer — verify */
+	spinlock_t qp_send_lock;	/* serializes senders on this QP */
+	spinlock_t qp_recv_lock;	/* serializes receivers on this QP */
+};
+
+/*
+ * An element in the loopback Node QP message list.
+ */
+struct loopb_msg {
+	struct nodemsg msg;		/* the queued node QP message itself */
+	struct list_head list_member;	/* link in the loopback message list */
+};
+
+struct micscif_qp *micscif_nodeqp_nextmsg(struct micscif_dev *scifdev);
+int micscif_nodeqp_send(struct micscif_dev *scifdev, struct nodemsg *msg, struct endpt *ep);
+int micscif_nodeqp_intrhandler(struct micscif_dev *scifdev, struct micscif_qp *qp);
+int micscif_loopb_msg_handler(struct micscif_dev *scifdev, struct micscif_qp *qp);
+
+// Card side only functions
+int micscif_setup_card_qp(phys_addr_t host_phys, struct micscif_dev *dev);
+
+int micscif_setuphost_response(struct micscif_dev *scifdev, uint64_t payload);
+int micscif_setup_qp_connect(struct micscif_qp *qp, dma_addr_t *qp_offset, int local_size, struct micscif_dev *scifdev);
+int micscif_setup_qp_accept(struct micscif_qp *qp, dma_addr_t *qp_offset, dma_addr_t phys, int local_size, struct micscif_dev *scifdev);
+int micscif_setup_qp_connect_response(struct micscif_dev *scifdev, struct micscif_qp *qp, uint64_t payload);
+int micscif_setup_loopback_qp(struct micscif_dev *scifdev);
+int micscif_destroy_loopback_qp(struct micscif_dev *scifdev);
+void micscif_teardown_ep(void *endpt);
+void micscif_add_epd_to_zombie_list(struct endpt *ep, bool mi_eplock_held);
+
+#endif /* MICSCIF_NODEQP */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef _SCIF_RING_BUFFER_DEFINE
+#define _SCIF_RING_BUFFER_DEFINE
+
+/*
+ * This describes a general purpose, byte based
+ * ring buffer. It handles multiple readers or
+ * writers using a lock -- it is lockless between
+ * producer and consumer (so it can handle being
+ * used across the PCIe bus).
+ */
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
+/**
+ * This version is used to ensure component compatibility between the host and
+ * card driver modules that use the ring buffer functions. This version should
+ * be incremented whenever there is a change to the ring buffer module that
+ * affects the functionality of the ring buffer.
+ */
+#define RING_BUFFER_VERSION 1
+
+/* Two of these actually form a single queue -- one on each side of the PCIe
+ * bus
+ *
+ * NOTE! This only works if the queue (pointed to at rb_base) exists in the
+ * consumer's memory. The code does not do any wbinvd after writing to the
+ * buffer, which assumes that the memory is not cached on the writers side.
+ *
+ * If this structure were to be used across the PCIe bus with the buffer
+ * living on the other side of the bus, it wouldn't work (would require a
+ * wbinvd or use the linux dma streaming buffer API)
+ */
+struct micscif_rb {
+	volatile void *rb_base;		/* base of the ring buffer memory */
+	volatile uint32_t *read_ptr; /* Points to the read offset */
+	volatile uint32_t *write_ptr; /* Points to the write offset */
+	uint32_t size;			/* total ring size in bytes */
+	uint32_t current_read_offset; /* cache it to improve performance */
+	uint32_t current_write_offset; /* cache it to improve performance */
+	uint32_t old_current_read_offset;	/* previous cached read offset — presumably for micscif_rb_reset(); confirm */
+	uint32_t old_current_write_offset;	/* previous cached write offset — presumably for micscif_rb_reset(); confirm */
+};
+
+/**
+ * methods used by both
+ */
+void micscif_rb_init(struct micscif_rb *rb, volatile uint32_t *read_ptr,
+ volatile uint32_t *write_ptr, volatile void *rb_base,
+ const uint32_t size);
+
+/**
+ * writer-only methods
+ */
+/*
+ * write a new command, then micscif_rb_commit()
+ */
+int micscif_rb_write(struct micscif_rb *rb, void *msg, uint32_t size);
+/*
+ * After write(), then micscif_rb_commit()
+ */
+void micscif_rb_commit(struct micscif_rb *rb);
+/*
+ * used on power state change to reset cached pointers
+ */
+void micscif_rb_reset(struct micscif_rb *rb);
+
+/*
+ * Query space available for writing to a RB.
+ */
+int micscif_rb_space(struct micscif_rb *rb);
+/**
+ * reader-only methods
+ */
+/*
+ * uses (updates) the cached read pointer to get the next command,
+ * so the writer doesn't see the command as consumed.
+ *
+ * Returns number of bytes read
+ *
+ * Size is IN -- the caller passes in a size (the max size that
+ * the function will read out)
+ *
+ * msg is OUT, but the caller is responsible for allocating space to
+ * read into. The max size this function will read is what is passed
+ * into by size, so the buffer pointer to by msg MUST be at least size
+ * bytes long.
+ */
+int micscif_rb_get_next (struct micscif_rb *rb, void *msg, uint32_t size);
+
+/*
+ * updates the control block read pointer,
+ * which will be visible to the writer so it can re-use the space
+ */
+void micscif_rb_update_read_ptr(struct micscif_rb *rb);
+
+/*
+ * Count the number of empty slots in the RB
+ */
+uint32_t micscif_rb_count(struct micscif_rb *rb, uint32_t size);
+
+/**
+ * Return the ring buffer module version.
+ */
+uint16_t micscif_rb_get_version(void);
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICSCIF_RMA_H
+#define MICSCIF_RMA_H
+
+#ifdef CONFIG_MMU_NOTIFIER
+#include <linux/mmu_notifier.h>
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#include <linux/huge_mm.h>
+#endif
+#ifdef CONFIG_HUGETLB_PAGE
+#include <linux/hugetlb.h>
+#endif
+#endif
+#include "scif.h"
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+#include "mic/micscif_kmem_cache.h"
+
+/*
+ * Per-address-space MMU notifier state for one SCIF endpoint.
+ * Allows temp cached windows to be torn down when the mm changes.
+ */
+struct rma_mmu_notifier {
+#ifdef CONFIG_MMU_NOTIFIER
+ struct mmu_notifier ep_mmu_notifier;
+#endif
+ /* True once the notifier has been registered with @mm */
+ bool ep_mn_registered;
+ /* List of temp registration windows for self */
+ struct list_head tc_reg_list;
+ /* Address space being watched */
+ struct mm_struct *mm;
+ /* Owning endpoint */
+ struct endpt *ep;
+ /* List linkage (presumably endpt_rma_info.mmn_list — verify) */
+ struct list_head list_member;
+};
+
+/* Per Endpoint Remote Memory Access Information */
+struct endpt_rma_info {
+ /* List of registration windows for self */
+ struct list_head reg_list;
+ /* List of registration windows for peer */
+ struct list_head remote_reg_list;
+ /* Offset generator */
+ struct va_gen_addr va_gen;
+ /*
+ * Synchronizes access to self/remote list and also
+ * protects the window from being destroyed while
+ * RMAs are in progress.
+ */
+ struct mutex rma_lock;
+ /*
+ * Synchronizes access to temporary cached windows list
+ * for SCIF Registration Caching.
+ */
+ spinlock_t tc_lock;
+ /*
+ * Synchronizes access to the list of MMU notifiers
+ * registered for this SCIF endpoint.
+ */
+ struct mutex mmn_lock;
+ /*
+ * Synchronizes access to the SCIF registered address space
+ * offset generator.
+ */
+ struct mutex va_lock;
+ /*
+ * Keeps track of number of outstanding temporary registered
+ * windows created by scif_vreadfrom/scif_vwriteto which have
+ * not been destroyed. tcw refers to the number of temporary
+ * cached windows and total number of pages pinned.
+ */
+ atomic_t tw_refcount; /* outstanding temp windows */
+ atomic_t tw_total_pages; /* pages in outstanding temp windows */
+ atomic_t tcw_refcount; /* outstanding temp cached windows */
+ atomic_t tcw_total_pages; /* pages pinned by temp cached windows */
+ /*
+ * MMU notifier so that we can destroy the windows when there is
+ * a change
+ */
+ struct list_head mmn_list;
+ /*
+ * Keeps track of number of outstanding remote fence requests
+ * which have been received by the peer.
+ */
+ int fence_refcount;
+ /*
+ * The close routine blocks on this wait queue to ensure that all
+ * remote fence requests have been serviced.
+ */
+ wait_queue_head_t fence_wq;
+ /*
+ * DMA channel used for all DMA transfers for this endpoint.
+ */
+ struct dma_channel *dma_chan;
+ /* Detect asynchronous list entry deletion */
+ int async_list_del;
+#ifdef _MIC_SCIF_
+ /* Local P2P proxy DMA virtual address for SUD updates by peer */
+ void *proxy_dma_va;
+ /* Local P2P proxy DMA physical address location for SUD updates */
+ dma_addr_t proxy_dma_phys;
+ /* Remote P2P proxy DMA physical address location for SUD updates */
+ dma_addr_t proxy_dma_peer_phys;
+#endif
+ /* List of tasks which have remote memory mappings */
+ struct list_head task_list;
+};
+
+/* Information used for tracking remote fence requests */
+struct fence_info {
+ /* State of this transfer */
+ enum micscif_msg_state state;
+
+ /* Fences wait on this queue */
+ wait_queue_head_t wq;
+
+ /* Used for storing the DMA mark */
+ int dma_mark;
+};
+
+/* Per remote fence wait request */
+struct remote_fence_info {
+ /* The SCIF_WAIT message */
+ struct nodemsg msg;
+
+ /* List linkage for the endpoint's pending remote fences */
+ struct list_head list_member;
+};
+
+/* Self or Peer window */
+enum rma_window_type {
+ RMA_WINDOW_SELF = 0x1, /* window registered by this endpoint */
+ RMA_WINDOW_PEER /* mirror of a window registered by the peer */
+};
+
+/* The number of physical addresses that can be stored in a PAGE.
+ * (PAGE_SIZE / sizeof(dma_addr_t); the >> 3 assumes 8-byte addresses.)
+ */
+#define NR_PHYS_ADDR_IN_PAGE (PAGE_SIZE >> 3)
+
+/*
+ * Store an array of lookup offsets. Each offset in this array maps
+ * one 4K page containing 512 physical addresses i.e. 2MB. 512 such
+ * offsets in a 4K page will correspond to 1GB of registered address space.
+ */
+struct rma_lookup {
+ /* Array of offsets */
+ dma_addr_t *lookup;
+ /* Offset used to map lookup array */
+ dma_addr_t offset;
+};
+
+
+/*
+ * A set of pinned pages obtained with scif_pin_pages() which could be part
+ * of multiple registered windows across different end points.
+ */
+struct scif_pinned_pages {
+ int64_t nr_pages;
+ int prot;
+ int map_flags;
+ atomic_t ref_count;
+ uint64_t magic;
+ /*
+ * Array of pointers to struct pages populated
+ * with get_user_pages(..)
+ */
+ struct page **pages;
+ int *num_pages;
+ int64_t nr_contig_chunks;
+ /* Only for Hosts without THP but with Huge TLB FS Like SuSe11 SP1 */
+ struct vm_area_struct **vma;
+};
+
+/*
+ * Information about a particular task which has remote memory mappings
+ * created via scif_mmap(..).
+ */
+struct rma_task_info {
+ /*
+ * Stores the pid struct of the grp_leader task structure which
+ * scif_mmap(..)'d the remote window.
+ */
+ struct pid *pid;
+ int ref_count;
+ struct list_head list_member;
+};
+
+/* Registration Window for Self */
+struct reg_range_t {
+ /* Number of pages spanned by this window */
+ int64_t nr_pages;
+ /* Number of contiguous physical chunks */
+ int64_t nr_contig_chunks;
+ /* Protection flags for this window */
+ int prot;
+ /* Per-page reference count; see get/put/set_window_ref_count() */
+ int ref_count;
+ /* Cookie to detect corruption */
+ uint64_t magic;
+ /* Offset of this window in the registered address space */
+ uint64_t offset;
+ /* va address that this window represents
+ * Useful only for temp windows */
+ void *va_for_temp;
+ /* Used for temporary windows */
+ int dma_mark;
+ /*
+ * Pointer to EP. Useful for passing EP around
+ * with messages to avoid expensive list
+ * traversals.
+ */
+ uint64_t ep;
+
+ struct list_head list_member;
+
+ /* Self or peer window; see enum rma_window_type */
+ enum rma_window_type type;
+
+ /*
+ * Pointer to peer window. Useful for sending
+ * messages to peer without requiring an
+ * extra list traversal
+ */
+ uint64_t peer_window;
+
+ /* Unregistration state */
+ enum micscif_msg_state unreg_state;
+
+ /*
+ * True for temporary windows created via
+ * scif_vreadfrom/scif_vwriteto.
+ */
+ bool temp;
+
+ /* True once the window offset has been released back to va_gen */
+ bool offset_freed;
+
+ /* Local P2P proxy DMA physical address location for SUD updates */
+ dma_addr_t proxy_dma_phys;
+
+ union {
+ /* Self RAS */
+ struct {
+ /* The set of pinned_pages backing this window */
+ struct scif_pinned_pages *pinned_pages;
+
+ /* Handle for sending ALLOC_REQ */
+ struct allocmsg alloc_handle;
+
+ /* Wait Queue for a registration (N)ACK */
+ wait_queue_head_t regwq;
+
+ /* Registration state */
+ enum micscif_msg_state reg_state;
+
+ /* Wait Queue for an unregistration (N)ACK */
+ wait_queue_head_t unregwq;
+ };
+ /* Peer RAS specific window elements */
+ struct {
+#ifdef CONFIG_ML1OM
+ /* Lookup for physical addresses used for mmap */
+ struct rma_lookup phys_addr_lookup;
+
+ /* Lookup for temp physical addresses used for mmap */
+ struct rma_lookup temp_phys_addr_lookup;
+
+ /* Mmap state */
+ enum micscif_msg_state gttmap_state;
+
+ /* Wait Queue for an unregistration (N)ACK */
+ wait_queue_head_t gttmapwq;
+
+ /* Ref count per page */
+ int *page_ref_count;
+#endif
+ /* Lookup for physical addresses used for DMA */
+ struct rma_lookup dma_addr_lookup;
+
+ /* Number of entries in lookup */
+ int nr_lookup;
+
+ /* Offset used to map the window by the peer */
+ dma_addr_t mapped_offset;
+
+ /* Ref count for tracking scif_get_pages */
+ int get_put_ref_count;
+ };
+ };
+#ifdef CONFIG_ML1OM
+ /* Array of physical addresses used for creating VtoP mappings */
+ /* FIXME: these are phys_addr as seen by the peer node, node at the
+ * opposite end of the endpt
+ */
+ dma_addr_t *phys_addr;
+
+ /* Temporary array for storing physical addresses for performance */
+ dma_addr_t *temp_phys_addr;
+#endif
+
+ /* Array of physical addresses used for Host & MIC initiated DMA */
+ dma_addr_t *dma_addr;
+
+ /* Array specifying number of pages for each physical address */
+ int *num_pages;
+ /* mm of the task which created this window — TODO confirm */
+ struct mm_struct *mm;
+} __attribute__ ((packed));
+
+
+/* Assert that a window/pinned-pages magic cookie is intact */
+#define RMA_MAGIC(x) BUG_ON(x->magic != SCIFEP_MAGIC)
+
+/* If this bit is set then the mark is a remote fence mark */
+#define SCIF_REMOTE_FENCE_BIT 30
+/* Magic value used to indicate a remote fence request */
+#define SCIF_REMOTE_FENCE (1ULL << SCIF_REMOTE_FENCE_BIT)
+
+/* Direction of an RMA transfer relative to the local endpoint */
+enum rma_direction {
+ LOCAL_TO_REMOTE,
+ REMOTE_TO_LOCAL
+};
+
+/* Initialize RMA for this EP */
+int micscif_rma_ep_init(struct endpt *ep);
+
+/* Check if epd can be uninitialized */
+int micscif_rma_ep_can_uninit(struct endpt *ep);
+
+/* Obtain a new offset. Callee must grab RMA lock */
+int micscif_get_window_offset(struct endpt *ep, int flags,
+ uint64_t offset, size_t len, uint64_t *out_offset);
+
+/* Free offset. Callee must grab RMA lock */
+void micscif_free_window_offset(struct endpt *ep,
+ uint64_t offset, size_t len);
+
+/* Create self registration window */
+struct reg_range_t *micscif_create_window(struct endpt *ep,
+ int64_t nr_pages, uint64_t offset, bool temp);
+
+/* Create a set of pinned pages */
+struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot);
+
+/* Destroy a set of pinned pages */
+int micscif_destroy_pinned_pages(struct scif_pinned_pages *pages);
+
+/* Destroy self registration window.*/
+int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window);
+
+int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window);
+
+/* Map pages of self window to Aperture/PCI */
+int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool temp);
+
+/* Unregister a self window */
+int micscif_unregister_window(struct reg_range_t *window);
+
+/* Create remote registration window */
+struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages);
+
+/* Destroy remote registration window */
+void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window);
+
+int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window);
+
+/* Prepare a remote registration window */
+int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window);
+
+/* Create remote lookup entries for physical addresses */
+int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window);
+
+/* Destroy remote lookup entries for physical addresses */
+void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window);
+
+/* Send a SCIF_REGISTER message and wait for an ACK */
+int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window);
+
+/* Send a SCIF_UNREGISTER message */
+int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window);
+
+/* RMA copy API */
+int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len,
+ off_t roffset, int flags, enum rma_direction dir, bool last_chunk);
+
+/* Sends a remote fence mark request */
+int micscif_send_fence_mark(scif_epd_t epd, int *out_mark);
+
+/* Sends a remote fence wait request */
+int micscif_send_fence_wait(scif_epd_t epd, int mark);
+
+/* Sends a remote fence signal request */
+int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval,
+ off_t loff, uint64_t lval, int flags);
+
+/* Setup a DMA mark for an endpoint */
+int micscif_fence_mark(scif_epd_t epd);
+
+void ep_unregister_mmu_notifier(struct endpt *ep);
+#ifdef CONFIG_MMU_NOTIFIER
+void micscif_mmu_notif_handler(struct work_struct *work);
+#endif
+
+void micscif_rma_destroy_temp_windows(void);
+void micscif_rma_destroy_tcw_ep(struct endpt *ep);
+void micscif_rma_destroy_tcw_invalid(struct list_head *list);
+
+void micscif_rma_handle_remote_fences(void);
+
+/* Reserve a DMA channel for a particular endpoint */
+int micscif_reserve_dma_chan(struct endpt *ep);
+
+/* Program DMA SUD's after verifying the registered offset */
+int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val,
+ enum rma_window_type type);
+
+/* Kill any applications which have valid remote memory mappings */
+void micscif_kill_apps_with_mmaps(int node);
+
+/* Query if any applications have remote memory mappings */
+bool micscif_rma_do_apps_have_mmaps(int node);
+
+/* Get a reference to the current task which is creating a remote memory mapping */
+int micscif_rma_get_task(struct endpt *ep, int nr_pages);
+
+/* Release a reference to the current task which is destroying a remote memory mapping */
+void micscif_rma_put_task(struct endpt *ep, int nr_pages);
+
+/* Cleanup remote registration lists for zombie endpoints */
+void micscif_cleanup_rma_for_zombies(int node);
+
+#ifdef _MIC_SCIF_
+void micscif_teardown_proxy_dma(struct endpt *ep);
+#endif
+
+static __always_inline
+bool is_unaligned(off_t src_offset, off_t dst_offset)
+{
+ /*
+ * A copy is considered "unaligned" when the source and destination
+ * offsets fall at different positions within an L1 cacheline, i.e.
+ * their cacheline-relative remainders differ.
+ */
+ off_t src_rem = src_offset & (L1_CACHE_BYTES - 1);
+ off_t dst_rem = dst_offset & (L1_CACHE_BYTES - 1);
+
+ return src_rem != dst_rem;
+}
+
+static __always_inline
+int __scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
+ off_t roffset, int flags)
+{
+ int err = 0;
+
+ pr_debug("SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx"
+ " offset 0x%lx flags 0x%x\n",
+ epd, loffset, len, roffset, flags);
+
+ /*
+ * Cacheline-misaligned transfers are split into bounded chunks;
+ * each chunk except the last is issued with last_chunk == false.
+ */
+ if (is_unaligned(loffset, roffset)) {
+ for (; len > MAX_UNALIGNED_BUF_SIZE;
+ loffset += MAX_UNALIGNED_BUF_SIZE,
+ roffset += MAX_UNALIGNED_BUF_SIZE,
+ len -= MAX_UNALIGNED_BUF_SIZE) {
+ err = micscif_rma_copy(epd, loffset, NULL,
+ MAX_UNALIGNED_BUF_SIZE,
+ roffset, flags, REMOTE_TO_LOCAL, false);
+ if (err)
+ return err;
+ }
+ }
+ /* Final (or only) chunk carries last_chunk == true */
+ return micscif_rma_copy(epd, loffset, NULL, len,
+ roffset, flags, REMOTE_TO_LOCAL, true);
+}
+
+static __always_inline
+int __scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
+ off_t roffset, int flags)
+{
+ int err = 0;
+
+ pr_debug("SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx"
+ " roffset 0x%lx flags 0x%x\n",
+ epd, loffset, len, roffset, flags);
+
+ /*
+ * Cacheline-misaligned transfers are split into bounded chunks;
+ * each chunk except the last is issued with last_chunk == false.
+ */
+ if (is_unaligned(loffset, roffset)) {
+ for (; len > MAX_UNALIGNED_BUF_SIZE;
+ loffset += MAX_UNALIGNED_BUF_SIZE,
+ roffset += MAX_UNALIGNED_BUF_SIZE,
+ len -= MAX_UNALIGNED_BUF_SIZE) {
+ err = micscif_rma_copy(epd, loffset, NULL,
+ MAX_UNALIGNED_BUF_SIZE,
+ roffset, flags, LOCAL_TO_REMOTE, false);
+ if (err)
+ return err;
+ }
+ }
+ /* Final (or only) chunk carries last_chunk == true */
+ return micscif_rma_copy(epd, loffset, NULL, len,
+ roffset, flags, LOCAL_TO_REMOTE, true);
+}
+
+static __always_inline
+int __scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags)
+{
+ int err = 0;
+
+ pr_debug("SCIFAPI vreadfrom: ep %p addr %p len 0x%lx"
+ " roffset 0x%lx flags 0x%x\n",
+ epd, addr, len, roffset, flags);
+
+ if (is_unaligned((off_t)addr, roffset)) {
+ /* Registration caching is disabled for large misaligned copies */
+ if (len > MAX_UNALIGNED_BUF_SIZE)
+ flags &= ~SCIF_RMA_USECACHE;
+
+ /* Issue all but the final chunk with last_chunk == false */
+ for (; len > MAX_UNALIGNED_BUF_SIZE;
+ roffset += MAX_UNALIGNED_BUF_SIZE,
+ len -= MAX_UNALIGNED_BUF_SIZE) {
+ err = micscif_rma_copy(epd, 0, addr,
+ MAX_UNALIGNED_BUF_SIZE,
+ roffset, flags, REMOTE_TO_LOCAL, false);
+ if (err)
+ return err;
+ addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE);
+ }
+ }
+ return micscif_rma_copy(epd, 0, addr, len,
+ roffset, flags, REMOTE_TO_LOCAL, true);
+}
+
+static __always_inline
+int __scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, int flags)
+{
+ int err = 0;
+
+ pr_debug("SCIFAPI vwriteto: ep %p addr %p len 0x%lx"
+ " roffset 0x%lx flags 0x%x\n",
+ epd, addr, len, roffset, flags);
+
+ if (is_unaligned((off_t)addr, roffset)) {
+ /* Registration caching is disabled for large misaligned copies */
+ if (len > MAX_UNALIGNED_BUF_SIZE)
+ flags &= ~SCIF_RMA_USECACHE;
+
+ /* Issue all but the final chunk with last_chunk == false */
+ for (; len > MAX_UNALIGNED_BUF_SIZE;
+ roffset += MAX_UNALIGNED_BUF_SIZE,
+ len -= MAX_UNALIGNED_BUF_SIZE) {
+ err = micscif_rma_copy(epd, 0, addr,
+ MAX_UNALIGNED_BUF_SIZE,
+ roffset, flags, LOCAL_TO_REMOTE, false);
+ if (err)
+ return err;
+ addr = (void *)((uint64_t)addr + MAX_UNALIGNED_BUF_SIZE);
+ }
+ }
+ return micscif_rma_copy(epd, 0, addr, len,
+ roffset, flags, LOCAL_TO_REMOTE, true);
+}
+
+void micscif_rma_completion_cb(uint64_t data);
+
+int micscif_pci_dev(uint16_t node, struct pci_dev **pdev);
+#ifndef _MIC_SCIF_
+int micscif_pci_info(uint16_t node, struct scif_pci_info *dev);
+#endif
+
+/*
+ * nr_pages in a 2MB page is specified via the top 12 bits in the
+ * physical address.
+ */
+
+/* Check all parenthesis in these macros. See if putting in bottom makes sense? */
+#define RMA_HUGE_NR_PAGE_SHIFT ((52))
+#define RMA_HUGE_NR_PAGE_MASK (((0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT))
+#define RMA_GET_NR_PAGES(addr) ((addr) >> RMA_HUGE_NR_PAGE_SHIFT)
+#define RMA_SET_NR_PAGES(addr, nr_pages) ((addr) = (((nr_pages) & 0xFFFULL) << RMA_HUGE_NR_PAGE_SHIFT) | ((uint64_t)(addr)))
+#define RMA_GET_ADDR(addr) ((addr) & ~(RMA_HUGE_NR_PAGE_MASK))
+
+extern bool mic_huge_page_enable;
+
+#define SCIF_HUGE_PAGE_SHIFT 21
+
+/*
+ * micscif_is_huge_page:
+ * @page: A physical page.
+ */
+static __always_inline int
+micscif_is_huge_page(struct scif_pinned_pages *pinned_pages, int index)
+{
+ int huge = 0;
+ struct page *page = pinned_pages->pages[index];
+
+ if (compound_order(page) + PAGE_SHIFT == SCIF_HUGE_PAGE_SHIFT)
+ huge = 1;
+ if (huge)
+ ms_info.nr_2mb_pages++;
+ if (!mic_huge_page_enable)
+ huge = 0;
+#ifdef RMA_DEBUG
+ WARN_ON(!page_count(page));
+ WARN_ON(page_mapcount(page) < 0);
+#endif
+ return huge;
+}
+
+/*
+ * micscif_detect_large_page:
+ * @pinned_pages: A set of pinned pages.
+ */
+static __always_inline int
+micscif_detect_large_page(struct scif_pinned_pages *pinned_pages, char *addr)
+{
+ int i = 0, nr_pages, huge;
+ char *next_huge, *end;
+ char *end_addr = addr + (pinned_pages->nr_pages << PAGE_SHIFT);
+
+ while (addr < end_addr) {
+ huge = micscif_is_huge_page(pinned_pages, i);
+ if (huge) {
+ next_huge = (char *)ALIGN(
+ (unsigned long)(addr + 1),
+ PMD_SIZE);
+ end = next_huge > end_addr ? end_addr : next_huge;
+ nr_pages = (int)((end - addr) >> PAGE_SHIFT);
+ pinned_pages->num_pages[i] = (int)nr_pages;
+ addr = end;
+ i += (int)nr_pages;
+
+ } else {
+ pinned_pages->num_pages[i] = 1;
+ i++;
+ addr += PAGE_SIZE;
+ ms_info.nr_4k_pages++;
+ }
+ pinned_pages->nr_contig_chunks++;
+ }
+ return 0;
+}
+
+/**
+ * micscif_set_nr_pages:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Set nr_pages in every entry of physical address/dma address array
+ * and also remove nr_pages information from physical addresses.
+ */
+static __always_inline void
+micscif_set_nr_pages(struct micscif_dev *dev, struct reg_range_t *window)
+{
+ int j;
+#ifdef CONFIG_ML1OM
+ int l = 0, k;
+#endif
+
+ for (j = 0; j < window->nr_contig_chunks; j++) {
+ window->num_pages[j] = RMA_GET_NR_PAGES(window->dma_addr[j]);
+ if (window->num_pages[j])
+ window->dma_addr[j] = RMA_GET_ADDR(window->dma_addr[j]);
+ else
+ break;
+#ifdef CONFIG_ML1OM
+ for (k = 0; k < window->num_pages[j]; k++)
+ if (window->temp_phys_addr[j])
+ window->phys_addr[l + k] =
+ RMA_GET_ADDR(window->temp_phys_addr[j]) + (k << PAGE_SHIFT);
+ l += window->num_pages[j];
+#endif
+ }
+}
+
+#ifdef CONFIG_ML1OM
+/*
+ * micscif_get_phys_addr:
+ * Obtain the phys_addr given the window and the offset.
+ * @window: Registered window.
+ * @off: Window offset.
+ *
+ * Indexes the per-page phys_addr array and ORs in the sub-page offset.
+ * No bounds checking is performed; the caller must pass an offset that
+ * lies within the window.
+ */
+static __always_inline dma_addr_t
+micscif_get_phys_addr(struct reg_range_t *window, uint64_t off)
+{
+ int page_nr = (off - window->offset) >> PAGE_SHIFT;
+ off_t page_off = off & ~PAGE_MASK;
+ return window->phys_addr[page_nr] | page_off;
+}
+#endif
+
+/* Sentinel returned when no DMA address can be resolved */
+#define RMA_ERROR_CODE (~(dma_addr_t)0x0)
+
+/*
+ * micscif_get_dma_addr:
+ * Obtain the dma_addr given the window and the offset.
+ * @window: Registered window.
+ * @off: Window offset.
+ * @nr_bytes: Return the number of contiguous bytes till next DMA addr index.
+ * @index: Return the index of the dma_addr array found.
+ * @start_off: start offset of index of the dma addr array found.
+ * The nr_bytes provides the callee an estimate of the maximum possible
+ * DMA xfer possible while the index/start_off provide faster lookups
+ * for the next iteration.
+ *
+ * Returns RMA_ERROR_CODE when the offset cannot be resolved.
+ */
+static __always_inline dma_addr_t
+micscif_get_dma_addr(struct reg_range_t *window, uint64_t off, size_t *nr_bytes, int *index, uint64_t *start_off)
+{
+ if (window->nr_pages == window->nr_contig_chunks) {
+ /* Fast path: every chunk is exactly one page */
+ int page_nr = (int)((off - window->offset) >> PAGE_SHIFT);
+ off_t page_off = off & ~PAGE_MASK;
+ if (nr_bytes)
+ *nr_bytes = PAGE_SIZE - page_off;
+ if (page_nr >= window->nr_pages) {
+ printk(KERN_ERR "%s dma_addr out of boundary\n", __FUNCTION__);
+ /*
+ * Bug fix: the original code logged the overrun but then
+ * performed the out-of-bounds read anyway. Fail instead.
+ */
+ return RMA_ERROR_CODE;
+ }
+ return window->dma_addr[page_nr] | page_off;
+ } else {
+ /* Slow path: scan contiguous chunks, resuming from *index */
+ int i = index ? *index : 0;
+ uint64_t end;
+ uint64_t start = start_off ? *start_off : window->offset;
+ for (; i < window->nr_contig_chunks; i++) {
+ end = start + (window->num_pages[i] << PAGE_SHIFT);
+ if (off >= start && off < end) {
+ if (index)
+ *index = i;
+ if (start_off)
+ *start_off = start;
+ if (nr_bytes)
+ *nr_bytes = end - off;
+ return (window->dma_addr[i] + (off - start));
+ }
+ start += (window->num_pages[i] << PAGE_SHIFT);
+ }
+ }
+#ifdef CONFIG_MK1OM
+ printk(KERN_ERR "%s %d BUG. Addr not found? window %p off 0x%llx\n", __func__, __LINE__, window, off);
+ BUG_ON(1);
+#endif
+ return RMA_ERROR_CODE;
+}
+
+/*
+ * scif_memset:
+ * @va: kernel virtual address
+ * @c: The byte used to fill the memory
+ * @size: Buffer size
+ *
+ * Helper API which fills size bytes of memory pointed to by va with the
+ * constant byte c. This API fills the memory in chunks of 4GB - 1 bytes
+ * for a single invocation of memset(..) to work around a kernel bug in
+ * x86_64 @ https://bugzilla.kernel.org/show_bug.cgi?id=27732
+ * where memset(..) does not do "ANY" work for size >= 4GB.
+ * This kernel bug has been fixed upstream in v3.2 via the commit
+ * titled "x86-64: Fix memset() to support sizes of 4Gb and above"
+ * but has not been backported to distributions like RHEL 6.3 yet.
+ */
+static __always_inline void scif_memset(char *va, int c, size_t size)
+{
+ size_t loop_size;
+ const size_t four_gb = 4 * 1024 * 1024 * 1024ULL;
+
+ while (size) {
+ loop_size = min(size, four_gb - 1);
+ memset(va, c, loop_size);
+ size -= loop_size;
+ va += loop_size;
+ }
+}
+
+/*
+ * scif_zalloc:
+ * @size: Size of the allocation request.
+ *
+ * Helper API which attempts to allocate zeroed pages via
+ * __get_free_pages(..) first and then falls back on
+ * vmalloc(..) if that fails. This is required because
+ * vmalloc(..) is *slow*.
+ */
+static __always_inline void *scif_zalloc(size_t size)
+{
+ void *ret;
+ size_t align = ALIGN(size, PAGE_SIZE);
+
+ if (!align)
+ return NULL;
+
+ if (align <= (1 << (MAX_ORDER + PAGE_SHIFT - 1)))
+ if ((ret = (void*)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(align))))
+ goto done;
+ if (!(ret = vmalloc(align)))
+ return NULL;
+
+ /* TODO: Use vzalloc once kernel supports it */
+ scif_memset(ret, 0, size);
+done:
+#ifdef RMA_DEBUG
+ atomic_long_add_return(align, &ms_info.rma_alloc_cnt);
+#endif
+ return ret;
+}
+
+/*
+ * scif_free:
+ * @addr: Address to be freed.
+ * @size: Size of the allocation.
+ * Helper API which frees memory allocated via scif_zalloc().
+ */
+static __always_inline void scif_free(void *addr, size_t size)
+{
+ size_t align = ALIGN(size, PAGE_SIZE);
+
+ if (unlikely(is_vmalloc_addr(addr)))
+ vfree(addr);
+ else {
+ free_pages((unsigned long)addr, get_order(align));
+ }
+#ifdef RMA_DEBUG
+ WARN_ON(atomic_long_sub_return(align, &ms_info.rma_alloc_cnt) < 0);
+#endif
+}
+
+/* Take one window reference per page spanned by the request. */
+static __always_inline void
+get_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
+{
+ int delta = (int)nr_pages;
+
+ window->ref_count += delta;
+}
+
+/* Drop one window reference per page spanned by the request. */
+static __always_inline void
+put_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
+{
+ window->ref_count -= (int)nr_pages;
+ /*
+ * Bug fix: sanity-check the field just decremented (ref_count).
+ * The original checked window->nr_pages, which this function never
+ * modifies, making the check a no-op.
+ */
+ BUG_ON(window->ref_count < 0);
+}
+
+/* Initialize the window reference count to one per page. */
+static __always_inline void
+set_window_ref_count(struct reg_range_t *window, int64_t nr_pages)
+{
+ int count = (int)nr_pages;
+
+ window->ref_count = count;
+}
+
+/* Debug API's */
+void micscif_display_window(struct reg_range_t *window, const char *s, int line);
+/*
+ * Grab a reference to the current task's mm for pinned-page accounting.
+ * Returns NULL when ulimit checking is disabled (no accounting needed).
+ * The caller must release via __scif_release_mm().
+ */
+static inline struct mm_struct *__scif_acquire_mm(void)
+{
+ if (mic_ulimit_check) {
+#ifdef RMA_DEBUG
+ atomic_long_add_return(1, &ms_info.rma_mm_cnt);
+#endif
+ return get_task_mm(current);
+ }
+ return NULL;
+}
+
+/*
+ * Release an mm reference taken by __scif_acquire_mm().
+ * Safe to call with mm == NULL (no-op).
+ */
+static inline void __scif_release_mm(struct mm_struct *mm)
+{
+ if (mic_ulimit_check && mm) {
+#ifdef RMA_DEBUG
+ WARN_ON(atomic_long_sub_return(1, &ms_info.rma_mm_cnt) < 0);
+#endif
+ mmput(mm);
+ }
+}
+
+/*
+ * Decrease the mm's pinned/locked page accounting by nr_pages under
+ * the mmap_sem write lock.
+ * @try_lock: when true, use down_write_trylock and return -1 if the
+ * lock cannot be taken immediately; otherwise block.
+ * Returns 0 on success (including the no-accounting no-op cases).
+ */
+static inline int __scif_dec_pinned_vm_lock(struct mm_struct *mm,
+ int64_t nr_pages, bool try_lock)
+{
+ if (mm && nr_pages && mic_ulimit_check) {
+ if (try_lock) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ return -1;
+ }
+ } else {
+ down_write(&mm->mmap_sem);
+ }
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
+ /* pinned_vm split from locked_vm in kernel 3.1 */
+ mm->pinned_vm -= nr_pages;
+#else
+ mm->locked_vm -= nr_pages;
+#endif
+ up_write(&mm->mmap_sem);
+ }
+ return 0;
+}
+
+/*
+ * Charge nr_pages against the mm's RLIMIT_MEMLOCK accounting.
+ * Returns -ENOMEM when the new total would exceed the limit and the
+ * caller lacks CAP_IPC_LOCK; 0 otherwise. Note: the caller is expected
+ * to hold the appropriate mm lock — this helper does not take it.
+ */
+static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm,
+ int64_t nr_pages)
+{
+ if (mm && mic_ulimit_check && nr_pages) {
+ unsigned long locked, lock_limit;
+ locked = nr_pages;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
+ locked += mm->pinned_vm;
+#else
+ locked += mm->locked_vm;
+#endif
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ pr_debug("locked(%lu) > lock_limit(%lu)\n",
+ locked, lock_limit);
+ return -ENOMEM;
+ } else {
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0))
+ mm->pinned_vm = locked;
+#else
+ mm->locked_vm = locked;
+#endif
+ }
+ }
+ return 0;
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICSCIF_RMA_LIST_H
+#define MICSCIF_RMA_LIST_H
+
+/*
+ * RMA Linked List Manipulation API's.
+ * Callee must Hold RMA lock to call the API's below.
+ * When and if RMA uses RB trees for log(n) search,
+ * similar API's should be implemented.
+ */
+
+/*
+ * Specifies whether an RMA operation can span
+ * across partial windows, a single window or multiple
+ * contiguous windows.
+ * Mmaps can span across parial windows.
+ * Unregistration can span across complete windows.
+ * scif_get_pages() can span a single window.
+ */
+enum range_request {
+ WINDOW_PARTIAL,
+ WINDOW_SINGLE,
+ WINDOW_FULL
+};
+
+/* Self Registration list RMA Request query */
+struct micscif_rma_req {
+ /* Out: first window matching the query */
+ struct reg_range_t **out_window;
+ /* Offset into the registered address space */
+ uint64_t offset;
+ /* Length of the request in bytes */
+ size_t nr_bytes;
+ /* Required protection flags */
+ int prot;
+ /* How many windows the request may span; see enum range_request */
+ enum range_request type;
+ /* Registration list to search */
+ struct list_head *head;
+ /* va to match for temp windows */
+ void *va_for_temp;
+};
+
+/**
+ * struct mic_copy_work:
+ *
+ * Work for DMA copy thread is provided by alloocating and preparing
+ * struct mic_copy_work and calling mic_enqueue_copy_work.
+ */
+struct mic_copy_work {
+ uint64_t src_offset;
+
+ uint64_t dst_offset;
+
+ /* Starting src registered window */
+ struct reg_range_t *src_window;
+
+ /* Starting dst registered window */
+ struct reg_range_t *dst_window;
+
+ /* Is this transfer a loopback transfer? */
+ int loopback;
+
+ size_t len;
+ /* DMA copy completion callback. Details in mic_dma_lib.h */
+ struct dma_completion_cb *comp_cb;
+
+ struct micscif_dev *remote_dev;
+
+ /* DO_DMA_POLLING or DO_DMA_INTR or none */
+ int fence_type;
+
+ bool ordered;
+
+#ifdef CONFIG_ML1OM
+ /* GTT map state */
+ enum micscif_msg_state gttmap_state;
+
+ /* Wait Queue for a GTT map (N)ACK */
+ wait_queue_head_t gttmapwq;
+
+ uint64_t gtt_offset;
+
+ uint64_t gtt_length;
+
+#endif
+ bool dma_chan_released;
+ struct list_head list_member;
+};
+
+/* Insert */
+void micscif_insert_window(struct reg_range_t *window, struct list_head *head);
+void micscif_insert_tcw(struct reg_range_t *window,
+ struct list_head *head);
+
+/* Query */
+int micscif_query_window(struct micscif_rma_req *request);
+int micscif_query_tcw(struct endpt *ep, struct micscif_rma_req *request);
+
+/* Called from close to unregister all self windows */
+int micscif_unregister_all_windows(scif_epd_t epd);
+
+/* Traverse list and munmap */
+void micscif_rma_list_munmap(struct reg_range_t *window, uint64_t offset, int nr_pages);
+/* Traverse list and mmap */
+int micscif_rma_list_mmap(struct reg_range_t *start_window,
+ uint64_t offset, int nr_pages, struct vm_area_struct *vma);
+/* Traverse list and unregister */
+int micscif_rma_list_unregister(struct reg_range_t *window, uint64_t offset, int nr_pages);
+
+/* CPU copy */
+int micscif_rma_list_cpu_copy(struct mic_copy_work *work);
+
+/* Traverse remote RAS and ensure none of the get_put_ref_counts are +ve */
+int micscif_rma_list_get_pages_check(struct endpt *ep);
+
+/* Debug API's */
+void micscif_display_all_windows(struct list_head *head);
+
+int micscif_rma_list_dma_copy_wrapper(struct endpt *epd, struct mic_copy_work *work, struct dma_channel *chan, off_t loffset);
+
+void micscif_rma_local_cpu_copy(uint64_t offset, struct reg_range_t *window, uint8_t *temp, size_t remaining_len, bool to_temp);
+
+#endif /* MICSCIF_RMA_LIST_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MIC_SMPT_H
+#define MIC_SMPT_H
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
+#define MAX_BOARD_SUPPORTED 256
+
+#define SNOOP_ON (0 << 0)
+#define SNOOP_OFF (1 << 0)
+#define NUM_SMPT_REGISTERS 32
+#define NUM_SMPT_ENTRIES_IN_USE 32
+#define SMPT_MASK 0x1F
+#define MIC_SYSTEM_PAGE_SHIFT 34ULL
+#define MIC_SYSTEM_PAGE_MASK ((1ULL << MIC_SYSTEM_PAGE_SHIFT) - 1ULL)
+
+struct _mic_ctx_t;
+struct pci_dev;
+
+/*
+ * Host-side bookkeeping for one SMPT (System Memory Page Table) entry.
+ * There are NUM_SMPT_ENTRIES_IN_USE of these per board.
+ */
+typedef struct mic_smpt {
+	dma_addr_t dma_addr;	/* host DMA address programmed into this SMPT register */
+	int64_t ref_count;	/* live mappings using this entry; entry is free at 0 -- TODO confirm */
+} mic_smpt_t;
+
+
+/* Sbox Smpt Reg Bits:
+ * Bits 31:2 Host address
+ * Bits 1 RSVD
+ * Bits 0 No snoop
+ */
+#define BUILD_SMPT(NO_SNOOP, HOST_ADDR) \
+ (uint32_t)(((((HOST_ADDR)<< 2) & (~0x03)) | ((NO_SNOOP) & (0x01))))
+
+bool is_syspa(dma_addr_t hostmic_pa);
+
+dma_addr_t mic_map(int bid, dma_addr_t dma_addr, size_t size);
+void mic_unmap(int bid, dma_addr_t dma_addr, size_t size);
+
+dma_addr_t mic_map_single(int bid, struct pci_dev *hwdev, void *p, size_t size);
+void mic_unmap_single(int bid, struct pci_dev *hwdev, dma_addr_t mic_addr,
+ size_t size);
+
+dma_addr_t mic_ctx_map_single(struct _mic_ctx_t *mic_ctx, void *p, size_t size);
+void mic_ctx_unmap_single(struct _mic_ctx_t *mic_ctx, dma_addr_t dma_addr,
+ size_t size);
+
+dma_addr_t mic_to_dma_addr(int bid, dma_addr_t mic_addr);
+void mic_smpt_set(volatile void *mm_sbox, uint64_t dma_addr, uint64_t index);
+
+/*
+ * Report whether a MIC address mapping attempt failed.
+ * A zero address is the failure sentinel used by the mapping routines
+ * in this header (presumably mic_map()/mic_map_single() -- confirm).
+ */
+static inline bool mic_map_error(dma_addr_t mic_addr)
+{
+	return (mic_addr == 0);
+}
+#endif // MIC_SMPT_H
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* generate a virtual address for a given size */
+
+#ifndef MICSCIF_VA_GEN_H
+#define MICSCIF_VA_GEN_H
+
+#include "micscif_va_node.h"
+
+/*
+ * To avoid collisions with user applications trying to use
+ * MAP_FIXED with scif_register(), the following window address space
+ * allocation scheme is used.
+ *
+ * 1) (0) - (2^62 - 1)
+ * Window Address Space that can be claimed using MAP_FIXED.
+ * 2) (2^62) - (2^63 - 1)
+ * Window address space used for allocations by the SCIF driver
+ * when MAP_FIXED is not passed.
+ */
+#define VA_GEN_MIN 0x4000000000000000
+#define VA_GEN_RANGE 0x3f00000000000000
+
+#define INVALID_VA_GEN_ADDRESS 0xff00000000000000
+#define INVALID_VA_PAGE_INDEX 0xff00000000000
+
+/*
+ * State for the window virtual-address generator; allocations are taken
+ * from the address-space split described in the comment above.
+ */
+struct va_gen_addr {
+	struct va_node_allocator allocator;	/* slab-backed pool of va_node entries */
+	uint32_t hole_list;	/* node index heading the free-hole list (index, not pointer) */
+	uint32_t claims_list;	/* node index heading the claimed-region list -- TODO confirm */
+	uint64_t base;		/* base passed to va_gen_init(); offset for generated addresses -- TODO confirm */
+};
+
+/*
+ * return a base for the range
+ * caller trusted to keep track of both base and range
+ */
+uint64_t va_gen_alloc(struct va_gen_addr *addr,
+ uint64_t num_bytes, uint32_t align_bytes);
+
+/* Claim ownership of memory region. Fails if already occupied */
+uint64_t va_gen_claim(struct va_gen_addr *addr,
+ uint64_t address, uint64_t num_bytes);
+
+/* release ownership of a base/range */
+void va_gen_free(struct va_gen_addr *addr,
+ uint64_t address, uint64_t num_bytes);
+
+int va_gen_init(struct va_gen_addr *addr, uint64_t base, uint64_t range);
+
+void va_gen_destroy(struct va_gen_addr *addr);
+
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* generate a virtual address for a given size */
+#ifndef MICSCIF_VA_NODE_H
+#define MICSCIF_VA_NODE_H
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
+#define invalid_va_node_index ((uint32_t)(-1))
+
+/*
+ * One address range [base, base+range) in the allocator. Nodes are
+ * linked by 32-bit index into the slab array rather than by pointer.
+ */
+struct va_node {
+	uint32_t next;	/* index of next node; presumably invalid_va_node_index ends a list */
+	uint64_t base;	/* start of the range */
+	uint64_t range;	/* length of the range */
+};
+
+/* Pool allocator for struct va_node entries. */
+struct va_node_allocator {
+	/*
+	 * Emulates a variable-size array as a sequence of fixed-size slabs.
+	 * pp_slab_directory holds the sequence; each slab is a contiguous
+	 * block of nodes, so growing the pool costs one allocation of
+	 * nodes_in_slab nodes instead of one allocation per node.
+	 */
+	uint32_t slab_shift;	/* presumably log2(nodes_in_slab) -- TODO confirm */
+	uint32_t nodes_in_slab;	/* nodes per slab */
+	uint32_t slab_mask;	/* mask for index within a slab; presumably nodes_in_slab - 1 */
+	struct va_node **pp_slab_directory;	/* array of num_slabs slab pointers */
+	uint32_t num_slabs;	/* slabs currently in the directory */
+	uint32_t num_free_slabs;	/* directory slots not yet populated -- TODO confirm */
+	uint32_t free_list;	/* index of first free node -- TODO confirm sentinel */
+};
+
+int va_node_is_valid(uint32_t index);
+
+/*
+ * get the node corresponding to a NodePtr
+ * We are emulating a variable-size array
+ */
+struct va_node *va_node_get(struct va_node_allocator *node, uint32_t index);
+
+/* returns an NodePtr to a free node */
+int va_node_alloc(struct va_node_allocator *node, uint32_t *out_alloc);
+
+/* put a node back into the free pool, by NodePtr */
+void va_node_free(struct va_node_allocator *node, uint32_t index);
+
+void va_node_init(struct va_node_allocator *node);
+
+void va_node_destroy(struct va_node_allocator *node);
+
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICVCONS_H
+#define MICVCONS_H
+
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+#include <mic/micscif.h>
+#include <mic/micscif_nm.h>
+
+#define MICVCONS_DEVICE_NAME "ttyMIC"
+
+#define MICVCONS_BUF_SIZE PAGE_SIZE
+#define MICDCONS_MAX_OUTPUT_BYTES 64
+#define MICVCONS_SHORT_TIMEOUT 100
+#define MICVCONS_MAX_TIMEOUT 500
+
+#define MIC_VCONS_READY 0xc0de
+#define MIC_VCONS_SLEEPING 0xd00d
+#define MIC_VCONS_WAKINGUP 0xd12d
+#define MIC_HOST_VCONS_READY 0xbad0
+#define MIC_VCONS_HOST_OPEN 0xbad1
+#define MIC_VCONS_RB_VER_ERR 0xbad2
+
+#define MICVCONS_TIMER_RESTART 1
+#define MICVCONS_TIMER_SHUTDOWN 0
+
+/* Per-board DMA virtual-console state held on the host. */
+typedef struct micvcons {
+	int dc_enabled;			/* flag: DMA console in use -- inferred from name */
+	void *dc_hdr_virt;		/* host virtual address of the console header */
+	void *dc_buf_virt;		/* host virtual address of the console buffer */
+	dma_addr_t dc_hdr_dma_addr;	/* DMA address of the header */
+	dma_addr_t dc_dma_addr;		/* DMA address of the buffer */
+	uint32_t dc_size;		/* console buffer size in bytes */
+} micvcons_t;
+
+/* One virtual-console TTY port: ring buffers, workqueue and locking. */
+typedef struct micvcons_port {
+	struct board_info *dp_bdinfo;	/* owning board */
+	struct micvcons *dp_vcons;	/* DMA console state for this board */
+	struct micscif_rb *dp_in;	/* input ring buffer -- TODO confirm direction */
+	struct micscif_rb *dp_out;	/* output ring buffer -- TODO confirm direction */
+	struct tty_struct *dp_tty;	/* TTY attached to this port, if open */
+	struct list_head list_member;	/* linkage in the global port list -- TODO confirm */
+	/*
+	 * Work queue used to schedule work that wakes up a sleeping card
+	 * and reads the data from the buffer.
+	 */
+	struct workqueue_struct *dp_wq;
+	struct work_struct dp_wakeup_read_buf;	/* work item queued on dp_wq */
+
+	spinlock_t dp_lock;
+	struct mutex dp_mutex;
+
+	volatile int dp_bytes;		/* byte count shared with another context -- semantics not visible here */
+	volatile uint32_t dp_canread;	/* readable-data indicator -- TODO confirm */
+
+	volatile struct file *dp_reader;	/* file currently reading the port, if any */
+	volatile struct file *dp_writer;	/* file currently writing the port, if any */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+	struct tty_port port;	/* kernels >= 3.10 require an embedded tty_port */
+#endif
+} micvcons_port_t;
+
+/*
+ * vcons IPC layout: shared descriptor exchanged between host and card.
+ * host_magic/mic_magic carry the MIC_VCONS_*/MIC_HOST_VCONS_* handshake
+ * values defined above; the rb_ver fields let either side detect a ring
+ * buffer version mismatch (MIC_VCONS_RB_VER_ERR).
+ */
+struct vcons_buf
+{
+	uint32_t host_magic;	/* host-side handshake/status word */
+	uint32_t mic_magic;	/* card-side handshake/status word */
+
+	uint16_t host_rb_ver;	/* host ring buffer version */
+	uint16_t mic_rb_ver;	/* card ring buffer version */
+
+	/* mic o/p buffer */
+	dma_addr_t o_buf_dma_addr; /* host buf dma addr*/
+	uint32_t o_wr;		/* write offset into the output buffer */
+	uint32_t o_size;	/* output buffer size in bytes */
+
+	/* mic i/p buffer */
+	uint64_t i_hdr_addr;	/* mic hdr addr */
+	uint64_t i_buf_addr;	/* mic buf addr */
+	uint32_t i_rd;		/* read offset into the input buffer */
+	uint32_t i_size;	/* input buffer size in bytes */
+};
+
+/* Card-resident console header: offsets/status the card updates. */
+struct vcons_mic_header
+{
+	uint32_t o_rd;		/* read offset into the output buffer */
+	uint32_t i_wr;		/* write offset into the input buffer */
+	uint32_t host_status;	/* host state as seen by the card -- TODO confirm values */
+};
+
+int micvcons_start(struct _mic_ctx_t *mic_ctx);
+int micvcons_port_write(struct micvcons_port *port, const unsigned char *buf,
+ int count);
+struct _mic_ctx_t;
+void micvcons_stop(struct _mic_ctx_t *mic_ctx);
+int micvcons_pm_disconnect_node(uint8_t *node_bitmask, enum disconn_type type);
+#endif /* MICVCONS_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICVETH_H
+#define MICVETH_H
+
+#include "micveth_dma.h"
+
+#include "micint.h"
+#include "micveth_common.h"
+
+#define MICVETH_MAX_PACKET_SIZE (63 * 1024)
+#define MICVETH_TRANSFER_FIFO_SIZE 128
+
+#define MICVETH_LINK_UP_MAGIC 0x1A77ABEE
+#define MICVETH_LINK_DOWN_MAGIC 0x1DEADBEE
+
+#define MICVETH_POLL_TIMER_DELAY 1
+#define MICVETH_CLIENT_TIMER_DELAY 10
+
+/* Host-side record for one packet in flight: its skb plus the physical
+ * address/length published in the shared ring. */
+typedef struct ring_packet {
+	struct sk_buff *pd_skb;	/* backing socket buffer */
+	uint64_t pd_phys;	/* physical address of the packet data */
+	uint64_t pd_length;	/* packet length in bytes */
+} ring_packet_t;
+
+/* One shared-ring descriptor as seen by both host and card. */
+typedef struct ring_desc {
+	uint64_t rd_phys;	/* physical address of the buffer */
+	uint64_t rd_length;	/* buffer length in bytes */
+	uint32_t rd_valid;	/* non-zero when the descriptor holds valid data -- TODO confirm */
+} ring_desc_t;
+
+/* Fixed-size descriptor FIFO with head/tail indices. */
+typedef struct ring_queue {
+	uint32_t rq_head;	/* consumer index -- TODO confirm producer/consumer roles */
+	uint32_t rq_tail;	/* producer index -- TODO confirm */
+	uint32_t rq_length;	/* number of descriptors (MICVETH_TRANSFER_FIFO_SIZE) */
+	ring_desc_t rq_descs[MICVETH_TRANSFER_FIFO_SIZE];
+} ring_queue_t;
+
+/* A TX/RX queue pair forming one virtual-ethernet ring. */
+typedef struct ring {
+	ring_queue_t r_tx;
+	ring_queue_t r_rx;
+} veth_ring_t;
+
+#define VETH_STATE_INITIALIZED 0
+#define VETH_STATE_LINKUP 1
+#define VETH_STATE_LINKDOWN 2
+
+
+/* Per-interface state for the legacy (POLL/INTR) virtual-ethernet driver. */
+typedef struct micveth_info {
+	struct pci_dev *vi_pdev;	/* PCI device of the MIC board */
+	struct net_device *vi_netdev;	/* the registered net device */
+	uint8_t *vi_sbox;		/* mapped SBOX MMIO region */
+	uint8_t *vi_dbox;		/* mapped DBOX MMIO region */
+	uint32_t *vi_scratch14;		/* scratch register used for link handshake -- TODO confirm */
+	uint32_t *vi_scratch15;		/* scratch register used for link handshake -- TODO confirm */
+	mic_ctx_t *mic_ctx;		/* owning board context */
+	volatile uint32_t vi_state;	/* one of VETH_STATE_* */
+	uint32_t vi_skb_mtu;		/* skb allocation size for the current MTU -- TODO confirm */
+
+	struct delayed_work vi_poll;	/* periodic link/ring poll work */
+
+	struct workqueue_struct *vi_wq;
+	char vi_wqname[16];		/* name buffer for vi_wq */
+	struct work_struct vi_bh;	/* bottom-half work item */
+	struct work_struct vi_txws;	/* transmit work item */
+
+	spinlock_t vi_rxlock;		/* protects receive-side state */
+	spinlock_t vi_txlock;		/* protects transmit-side state */
+
+	/* Ring storage plus the physical address/length shared with the card. */
+	struct {
+		veth_ring_t ring;
+		uint64_t phys;
+		uint64_t length;
+	} vi_ring;
+
+	veth_ring_t *ring_ptr;		/* pointer to the active ring (local or remote) -- TODO confirm */
+
+	ring_packet_t vi_tx_desc[MICVETH_TRANSFER_FIFO_SIZE];	/* per-slot TX packet records */
+	ring_packet_t vi_rx_desc[MICVETH_TRANSFER_FIFO_SIZE];	/* per-slot RX packet records */
+	uint32_t vi_pend;		/* pending-work counter/flag -- semantics not visible here */
+} micveth_info_t;
+
+/* Lifecycle states of the client poll loop (stored in lv_pollstate). */
+enum {
+	CLIENT_POLL_STOPPED,
+	CLIENT_POLL_RUNNING,
+	CLIENT_POLL_STOPPING,	/* stop requested; waiters woken on lv_wq -- TODO confirm */
+};
+
+/* Driver-global state for the legacy virtual-ethernet implementation. */
+typedef struct micveth {
+	int lv_num_interfaces;		/* interfaces created */
+	int lv_num_clients;		/* known clients -- TODO confirm vs. active */
+	int lv_active_clients;		/* clients currently active */
+	int lv_num_links_remaining;	/* links still awaiting up/down transition -- TODO confirm */
+	micveth_info_t *lv_info;	/* array of per-interface state */
+
+	struct mutex lv_state_mutex;	/* serializes state transitions */
+
+	uint32_t lv_pollstate;		/* one of CLIENT_POLL_* */
+	struct delayed_work lv_poll;	/* client poll work */
+	wait_queue_head_t lv_wq;	/* waited on during poll-state changes -- TODO confirm */
+
+} micveth_t;
+
+int micveth_init(struct device *dev);
+int micveth_init_legacy(int num_bds, struct device *dev);
+void micveth_exit(void);
+int micveth_probe(mic_ctx_t *mic_ctx);
+void micveth_remove(mic_ctx_t *mic_ctx);
+int micveth_start(mic_ctx_t *mic_ctx);
+void micveth_stop(mic_ctx_t *mic_ctx);
+
+#endif /* MICVETH_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICVETHCOMMON_H
+#define MICVETHCOMMON_H
+
+#ifndef ETH_HLEN
+#define ETH_HLEN 14
+#endif
+
+/* Link/lifecycle states of a vnet interface. */
+typedef enum micvnet_state {
+	MICVNET_STATE_UNDEFINED,
+	MICVNET_STATE_UNINITIALIZED,
+	MICVNET_STATE_LINKUP,
+	MICVNET_STATE_LINK_DOWN,
+	MICVNET_STATE_BEGIN_UNINIT,	/* teardown started -- TODO confirm */
+	MICVNET_STATE_TRANSITIONING,	/* between states; operations deferred -- TODO confirm */
+}micvnet_state;
+
+
+/*
+ * Fancy way of defining an enumeration and the mapping between them and
+ * the module parameter--they're guaranteed to be in sync this way.
+ */
+#define VNET_MODES \
+ __VNET_MODE(POLL, poll) \
+ __VNET_MODE(INTR, intr) \
+ __VNET_MODE(DMA, dma) \
+ /* end */
+#define __VNET_MODE(u, l) VNET_MODE_##u ,
+enum { VNET_MODES };
+#undef __VNET_MODE
+
+extern char *mic_vnet_modes[];
+extern int mic_vnet_mode;
+
+#endif /* MICVETHCOMMON_H */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICVETH_DMA_H
+#define MICVETH_DMA_H
+
+#include <linux/kernel.h>
+#include "micint.h"
+
+#include "mic_common.h"
+#include "mic_dma_lib.h"
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
+
+/*
+ Define this if only DMA mode is supported without legacy POLL/INTR modes
+ (i.e if only micveth_dma.c is included in the host/card side drivers, i.e
+ when linvnet.c is excluded from host side driver and micveth.c from card
+ side driver). This will ensure that other global symbols which are at
+ present common with legacy modes (in linvnet.c/micveth.c) are all included
+ in micveth_dma.c.
+*/
+#undef STANDALONE_VNET_DMA
+
+/*******************************************************/
+#define MICVNET_MSG_RB_SIZE 128
+#define DMA_ALIGNMENT L1_CACHE_BYTES
+#define VNET_MAX_SKBS 62
+
+/* The maximum total number of outstanding messages possible in the current
+ implementation is 2 * VNET_MAX_SKBS + 1. */
+#if (MICVNET_MSG_RB_SIZE < 2 * VNET_MAX_SKBS + 2)
+#error "MICVNET_MSG_RB_SIZE should be at least (2 * VNET_MAX_SKBS + 2)"
+#endif
+
+#if (MICVNET_MSG_RB_SIZE & (MICVNET_MSG_RB_SIZE - 1))
+#error "MICVNET_MSG_RB_SIZE should be power of 2"
+#endif
+
+/* Message identifiers carried in micvnet_msg.msg_id. */
+enum micvnet_msg_id {
+	MICVNET_MSG_ADD_DMA_BUFFER,	/* peer publishes a receive buffer (micvnet_msg_add_dma_buffer) */
+	MICVNET_MSG_DMA_COMPLETE,	/* a DMA transfer finished (micvnet_msg_dma_complete) */
+	MICVNET_MSG_LINK_DOWN,		/* link teardown notification; no body */
+	MICVNET_MSG_LINK_UP,		/* link up notification (micvnet_msg_link_up) */
+};
+
+/* Body of MICVNET_MSG_ADD_DMA_BUFFER: describes one receive buffer. */
+struct micvnet_msg_add_dma_buffer {
+	uint64_t buf_phys;	/* physical address of the buffer */
+	uint64_t buf_size;	/* buffer size in bytes */
+};
+
+/* Body of MICVNET_MSG_DMA_COMPLETE: identifies the finished transfer. */
+struct micvnet_msg_dma_complete {
+	uint64_t dst_phys;	/* destination physical address of the transfer */
+	uint64_t size;		/* bytes transferred */
+	uint64_t dma_offset;	/* offset of the payload within the buffer -- TODO confirm */
+};
+
+#define VNET_DRIVER_VERSION 1
+/* Body of MICVNET_MSG_LINK_UP: lets peers detect a version mismatch. */
+struct micvnet_msg_link_up {
+	uint64_t vnet_driver_version;	/* sender's VNET_DRIVER_VERSION */
+};
+
+/* Union of all message bodies; selected by micvnet_msg.msg_id. */
+union micvnet_msg_body {
+	struct micvnet_msg_add_dma_buffer micvnet_msg_add_dma_buffer;
+	struct micvnet_msg_dma_complete micvnet_msg_dma_complete;
+	struct micvnet_msg_link_up micvnet_msg_link_up;
+};
+
+/* One fixed-size message slot in the ring buffer. */
+struct micvnet_msg {
+	uint64_t msg_id;	/* an enum micvnet_msg_id value */
+	union micvnet_msg_body body;
+};
+
+/* Single-producer/single-consumer message ring shared across PCIe. */
+struct micvnet_msg_rb {
+	struct micvnet_msg buf[MICVNET_MSG_RB_SIZE];
+	volatile uint32_t head;		/* consumer index -- TODO confirm roles */
+	volatile uint32_t tail;		/* producer index -- TODO confirm */
+	uint32_t size;			/* ring capacity (MICVNET_MSG_RB_SIZE) */
+	volatile uint32_t prev_head;	/* previous index, presumably for update ordering -- TODO confirm */
+	volatile uint32_t prev_tail;	/* previous index, presumably for update ordering -- TODO confirm */
+};
+
+/* The TX/RX ring pair allocated by the host side. */
+struct micvnet_msg_ring_pair {
+	struct micvnet_msg_rb rb_tx;
+	struct micvnet_msg_rb rb_rx;
+};
+
+/* Directional view of the ring pair from one endpoint. */
+struct micvnet_msg_qp {
+	struct micvnet_msg_rb *tx;	/* ring this endpoint sends on */
+	struct micvnet_msg_rb *rx;	/* ring this endpoint receives on */
+};
+
+/*******************************************************/
+
+/* Restrict micvnet MTU to 63K because ping does not work on RHEL 6.3 with 64K
+ MTU - HSD [4118026] */
+#define MICVNET_MAX_MTU (63 * 1024)
+#define MICVNET_CARD_UP_MAGIC 0x1A77BBEE
+
+/* List entry for a posted receive skb and its published buffer. */
+struct rx_node {
+	struct list_head list;	/* linkage in micvnet_info.vi_rx_skb */
+	struct sk_buff *skb;	/* backing receive skb */
+	uint64_t phys;		/* physical address advertised to the peer */
+	uint64_t size;		/* buffer size in bytes */
+};
+
+/* List entry for a DMA buffer received from the peer. */
+struct dma_node {
+	struct list_head list;	/* linkage in micvnet_info.vi_dma_buf */
+	uint64_t phys;		/* physical address of the peer buffer */
+	uint64_t size;		/* buffer size in bytes */
+};
+
+/* List entry for a transmit skb awaiting completion. */
+struct tx_node {
+	struct list_head list;	/* linkage in micvnet_info.vi_tx_skb */
+	struct sk_buff *skb;
+};
+
+/* List entry for a transmit skb scheduled for DMA. */
+struct sched_node {
+	struct list_head list;		/* linkage in micvnet_info.vi_sched_skb */
+	struct sk_buff *skb;
+	unsigned char *skb_data_aligned;	/* skb data rounded for DMA_ALIGNMENT -- TODO confirm */
+	uint64_t dma_src_phys;		/* source physical address of the DMA */
+	uint64_t dma_size;		/* bytes to transfer */
+	uint64_t dma_offset;		/* payload offset within the destination -- TODO confirm */
+	uint64_t dst_phys;		/* destination physical address */
+};
+
+/* Fixed-capacity FIFO of same-sized objects stored in one flat buffer. */
+struct obj_list {
+	char *buf;			/* backing storage */
+	int size;			/* capacity in objects -- TODO confirm units */
+	size_t obj_size;		/* size of one object in bytes */
+	volatile uint32_t head;		/* consumer index -- TODO confirm roles */
+	volatile uint32_t tail;		/* producer index -- TODO confirm */
+};
+
+/* Per-interface state for the DMA-mode virtual-ethernet driver. */
+struct micvnet_info {
+	struct pci_dev *vi_pdev;	/* PCI device of the MIC board */
+	struct net_device *vi_netdev;	/* the registered net device */
+	uint8_t *vi_sbox;		/* mapped SBOX MMIO region */
+	uint8_t *vi_dbox;		/* mapped DBOX MMIO region */
+	uint32_t *vi_scratch14;		/* scratch register used for handshake -- TODO confirm */
+	mic_ctx_t *mic_ctx;		/* owning board context */
+	atomic_t vi_state;		/* a micvnet_state value, updated atomically */
+
+	struct workqueue_struct *vi_wq;
+	char vi_wqname[16];		/* name buffer for vi_wq */
+	struct work_struct vi_ws_bh;	/* bottom-half work item */
+	struct work_struct vi_ws_tx;	/* transmit work item */
+	struct work_struct vi_ws_dmacb;	/* DMA-completion callback work item */
+	struct work_struct vi_ws_link_down;	/* link-down handling work item */
+	struct work_struct vi_ws_stop;	/* interface stop work item */
+	struct work_struct vi_ws_start;	/* interface start work item */
+
+	spinlock_t vi_rxlock;		/* protects receive-side lists */
+	spinlock_t vi_txlock;		/* protects transmit-side lists */
+
+#ifdef HOST
+	struct micvnet_msg_ring_pair vi_rp;	/* host owns the ring pair storage */
+#else
+	struct micvnet_msg_ring_pair *ring_ptr;	/* card maps the host's ring pair */
+#endif
+	uint64_t vi_rp_phys;		/* physical address of the ring pair */
+	struct micvnet_msg_qp vi_qp;	/* this endpoint's tx/rx view of the rings */
+
+	struct obj_list dnode_list;	/* pool of dma_node objects -- TODO confirm */
+
+	struct list_head vi_rx_skb;	/* posted receive skbs (rx_node) */
+	struct list_head vi_dma_buf;	/* peer DMA buffers (dma_node) */
+	struct list_head vi_tx_skb;	/* in-flight transmit skbs (tx_node) */
+	struct list_head vi_sched_skb;	/* skbs scheduled for DMA (sched_node) */
+
+	mic_dma_handle_t dma_handle;	/* handle from the DMA library */
+	struct dma_channel *dma_chan;	/* DMA channel used for transfers */
+	struct dma_completion_cb dma_cb;	/* completion callback descriptor */
+	atomic_t cnt_dma_complete;	/* completed DMA transfers pending processing -- TODO confirm */
+
+	atomic_t cnt_dma_buf_avail;	/* peer buffers currently available */
+	bool link_down_initiator;	/* true if this side initiated link down */
+	atomic_t cnt_tx_pending;	/* transmits not yet completed */
+	wait_queue_head_t stop_waitq;	/* waited on while draining during stop */
+};
+
+
+/* Driver-global vnet state. */
+struct micvnet {
+	atomic_t lv_active_clients;	/* interfaces currently active */
+	int created;			/* non-zero once global init has run -- TODO confirm */
+};
+
+int micvnet_init(struct device *dev);
+void micvnet_exit(void);
+int micvnet_probe(mic_ctx_t *mic_ctx);
+void micvnet_remove(mic_ctx_t *mic_ctx);
+int micvnet_xmit(struct sk_buff *skb, struct net_device *dev);
+
+int micvnet_start(mic_ctx_t *mic_ctx);
+void micvnet_stop(mic_ctx_t *mic_ctx);
+
+#ifndef HOST
+int __init micvnet_module_init(void);
+void __exit micvnet_module_exit(void);
+#endif
+
+#ifdef STANDALONE_VNET_DMA
+#define micveth_init micvnet_init
+#define micveth_exit micvnet_exit
+#define micveth_probe micvnet_probe
+#define micveth_remove micvnet_remove
+#define micveth_start micvnet_start
+#define micveth_stop micvnet_stop
+#endif
+
+extern int vnet_num_buffers;
+#ifndef HOST
+extern ulong vnet_addr;
+#endif
+#endif // MICVETH_DMA_H
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+Description: This is a generic ring buffer implementation to be used by
+anyone who needs a ring buffer. The ring buffer is manipulated
+using Read and Write functions. These functions perform all of
+the necessary space checks and only complete the operation
+if the requested number of items can be read or written. A
+return value of false indicates that either the ring buffer
+contains fewer than the requested number of items (for Read) or
+there isn't enough space left in the ring buffer (for Write).
+*/
+
+#ifndef _MICHOST_RING_BUFFER_DEFINE
+
+#define _MICHOST_RING_BUFFER_DEFINE
+
+//
+// Requirements:
+// Ring base should be already aligned properly
+// Ring size should be just multiple of the alignment size
+// All packets should be at least multiple of 4 bytes for the purpose of padding
+//
+
+#define RINGBUFFER_ALIGNMENT_SIZE 64 // in byte
+
// Cached view of a single-producer/single-consumer ring buffer.
// readptr/writeptr point at the shared (volatile) control offsets; the
// curr_*/old_* fields are local copies kept to avoid re-reading the shared
// locations on every operation (see rb_commit()/rb_update_readptr()).
typedef struct _ringbuffer
{
	uint8_t *ringbuff_ptr;        // ring storage (aligned per the requirements above)
	volatile uint32_t *readptr;   // Points to the read offset
	volatile uint32_t *writeptr;  // Points to the write offset
	uint32_t ringbuffsize;        // total ring size; multiple of RINGBUFFER_ALIGNMENT_SIZE
	uint32_t curr_readoffset;     // cache it to improve performance.
	uint32_t curr_writeoffset;    // cache it to improve performance.
	uint32_t old_readoffset;      // previous read offset (pre-commit snapshot)
	uint32_t old_writeoffset;     // previous write offset (pre-commit snapshot)
} ringbuffer;
+
+// Commands common across all ring buffers
// Command opcodes carried in ringbuff_cmdhdr for every ring buffer
// (host -> MIC unless noted otherwise).
typedef enum _rb_cmdopcode
{
	// note: don't use 0, because the ring buffer
	// is initialized to a bunch of 0's that aren't really commands.
	MIC_RBCT_ERROR = 0x0, // an error has occurred if encountered
	MIC_RBCT_NOP, // Used to skip empty space in the ringbuffer.
	MIC_RBCT_DMAEXEC, // DMA buffer to transfer/execute
	MIC_RBCT_SHUTDOWN, // bus power-down imminent
	MIC_RBCT_CREATESTDPROCESS, // Launches an executable on the ramdisk.
	MIC_RBCT_CREATENATIVEPROCESS, // Launches a native process.
	// NRFIX : not implemented. If native apps are launched by loading shared
	// libraries(DLLs) into a standard stub app then this command goes away.
	MIC_RBCT_DESTROYPROCESS, // Destroys a process.
	MIC_RBCT_VIRTUALALLOC, // Creates a uOS virtual address range
	MIC_RBCT_MAPHOSTMEMORY, // Used to implement host kernel mode driver services
	MIC_RBCT_UNMAPHOSTMEMORY, // Unmaps host memory
	MIC_RBCT_UOSESCAPE, // Used to pass uOS escapes from the host
	MIC_RBCT_RESERVED1, // Reserved for future use
	MIC_RBCT_RESERVED2, // Reserved for future use
	MIC_RBCT_UPLOADSTDAPPLICATION, // Uploads a standard application to the uOS
	MIC_RBCT_CREATEUOSRESOURCE, // Creates a DPT page cache
	MIC_RBCT_DESTROYUOSRESOURCE, // Destroys a DPT page cache
	MIC_RBCT_RESERVE_RING_BANDWIDTH_DBOX_TRAFFIC, // Reserves a ring bandwidth for DBOX traffic

	// Following commands are from MIC->Host (CRBT => CPU ring buffer.)
	MIC_CRBT_LOG_INFO, // Host logs information sent by the uOS.

	// Always make these the last ones in the list
#if defined(MIC_DEBUG) || defined(ENABLE_GFXDEBUG)
	MIC_RBCT_READPHYSICALMEMORY = 0x8000, // Used by debug tools to read memory on the device
	MIC_RBCT_WRITEPHYSICALMEMORY, // Used by debug tools to write memory on the device
#endif // defined(MIC_DEBUG) || defined(ENABLE_GFXDEBUG)
	MIC_RBCT_CMD_MAX // No valid OpCodes above this one
}ringbuff_cmdop;
+
// Header preceding every command in the ring buffer; 32 bits total.
// NOTE(review): size is presumably the command length in bytes (the
// requirements above demand packets be multiples of 4 bytes) -- confirm
// whether it includes this header.
typedef struct _ringbuff_cmdhdr
{
	ringbuff_cmdop opcode:16; // one of ringbuff_cmdop above
	uint32_t size:16;         // command size
}ringbuff_cmdhdr;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//---------------------------------
+// methods used by both
+//---------------------------------
+// initialize cached ring buffer structure
+void rb_initialize(ringbuffer* ringbuff, volatile uint32_t* readptr,
+ volatile uint32_t* writeptr, void *buff, const uint32_t size);
+
+//---------------------------------
+// writer-only methods
+//---------------------------------
+// write a new command. Must follow with fence/MMIO, then RingBufferCommit()
+int rb_write(ringbuffer* ringbuff, ringbuff_cmdhdr* cmd_header);
+// After write(), do an mfence(), an MMIO write to serialize, then Commit()
+void rb_commit(ringbuffer* ringbuff);
+// used on power state change to reset cached pointers
+void rb_reset(ringbuffer* ringbuff);
+// used to determine the largest possible command that could be sent next
+uint32_t rb_get_available_space(ringbuffer* ringbuff);
+
+// TODO: It may be more optimal to have "Reserve" function exposed to the client
+// instead of requiring it to create a command that will be copied into the ring buffer.
+
+
+//---------------------------------
+// reader-only methods
+//---------------------------------
+// uses (updates) the cached read pointer to get the next command, so writer doesn't
+// see the command as consumed
+ringbuff_cmdhdr* rb_get_next_cmd(ringbuffer* ringbuff);
+// updates the control block read pointer, which will be visible to the writer so it
+// can re-use the space
+void rb_update_readptr(ringbuffer* ringbuff, ringbuff_cmdhdr* cmd_header);
+// reader skips all commands, updating its next read offset
+void rb_skip_to_offset(ringbuffer* ringbuff, uint32_t new_readptr);
+
// The uOS uses this method to determine whether the ring buffer is empty
// before attempting to fetch a command out of it. An empty ring buffer
// means the uOS has already fetched everything that was written.
+uint32_t rb_empty(ringbuffer* ringbuff);
+
+// only used by host simulator
+void rb_sync(ringbuffer* ringbuff);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __LINUX_GPL__
+//==============================================================================
+// FUNCTION: AlignLow
+//
// DESCRIPTION: Returns trunc(in_data / in_granularity) * in_granularity
+//
+// PARAMETERS:
+// in_data - Data to be aligned
+// in_granularity - Alignment chunk size - must be a power of 2
#if defined(__cplusplus)
template <typename TData>
#else // no C++
#define TData uint64_t
#endif // if C++

// Round in_data down to the nearest multiple of in_granularity.
// in_granularity must be a power of two (e.g. 64 -> mask 0x3f).
static inline TData AlignLow(TData in_data, uintptr_t in_granularity)
{
    // Clearing the low-order bits floors the value to the granularity.
    return in_data & ~((TData)(in_granularity - 1));
}

#if !defined(__cplusplus)
#undef TData
#endif // if no C++
#endif // __LINUX_GPL__
+
+#endif //_MICHOST_RING_BUFFER_DEFINE
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#if !defined(__MIC_COMMON_H)
+#define __MIC_COMMON_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+#include <mic/bootparams.h>
+#include <mic/micsboxdefine.h>
+#include <mic/micdboxdefine.h>
+#include <mic/ringbuffer.h>
+#include <mic/micscif.h>
+#ifdef USE_VCONSOLE
+#include <mic/micvcons.h>
+#endif
+#include <mic/micpsmi.h>
+#include <mic/io_interface.h>
+#include <mic/mic_pm.h>
+#include <mic/mic_dma_api.h>
+#include <mic/micveth_common.h>
+#include <mic/micscif_nm.h>
+
+#define GET_MAX(a, b) ( ((a) > (b)) ? (a) : (b) )
+#define GET_MIN(a, b) ( ((a) < (b)) ? (a) : (b) )
+
+// System Interrupt Cause Read Register 0
+#define SBOX_SICR0_DBR(x) ((x) & 0xf)
+#define SBOX_SICR0_DMA(x) (((x) >> 8) & 0xff)
+
+// System Interrupt Cause Enable Register 0
+#define SBOX_SICE0_DBR(x) ((x) & 0xf)
+#define SBOX_SICE0_DBR_BITS(x) ((x) & 0xf)
+#define SBOX_SICE0_DMA(x) (((x) >> 8) & 0xff)
+#define SBOX_SICE0_DMA_BITS(x) (((x) & 0xff) << 8)
+
+// System Interrupt Cause Read Register 1
+#define SBOX_SICR1_SBOXERR(x) ((x) & 0x1)
+#define SBOX_SICR1_SPIDONE(x) (((x) >> 4) & 0x1)
+
+// System Interrupt Cause Set Register 1
+#define SBOX_SICC1_SBOXERR(x) ((x) & 0x1)
+#define SBOX_SICC1_SPIDONE(x) (((x) >> 4) & 0x1)
+
+// Offsets in the MMIO Range for register segments
+#define HOST_DBOX_BASE_ADDRESS 0x00000000
+#define HOST_SBOX_BASE_ADDRESS 0x00010000
+#define HOST_GTT_BASE_ADDRESS 0x00040000
+
+#define SCRATCH0_MEM_TEST_DISABLE(x) ((x) & 0x1)
+#define SCRATCH0_MEM_USAGE(x) (((x) >> 1) & 0x3)
+#define SCR0_MEM_ALL 0x0
+#define SCR0_MEM_HALF 0x1
+#define SCR0_MEM_THIRD 0x2
+#define SCR0_MEM_FOURTH 0x3
+#define SCRATCH0_MEM_SIZE_KB(x) ((x) >> 0x3)
+
+#define SCRATCH2_DOWNLOAD_STATUS(x) ((x) & 0x1)
+
+#define SCRATCH2_CLEAR_DOWNLOAD_STATUS(x) ((x) & ~0x1)
+#define SCRATCH2_APIC_ID(x) (((x) >> 1) & 0x1ff)
+#define SCRATCH2_DOWNLOAD_ADDR(x) ((x) & 0xfffff000)
+
+#define SCRATCH13_SUB_STEP(x) ((x) & 0xf)
+#define SCRATCH13_STEP_ID(x) (((x) >> 4) & 0xf)
+#define SCRATCH13_PLATFORM_ID(x) (((x) >> 18) & 0x3)
+
+
+#define MEMVOLT_MEMVOLT(x) (((x) >>SHIFT_MEMVOLT) & MASK_MEMVOLT)
+#define MEMFREQ_MEMFREQ(x) (((x) >>SHIFT_MEMORYFREQ) & MASK_MEMORYFREQ)
+#define FAILSAFEOFFSET_FAILSAFE(x) (((x) >>SHIFT_FAIL_SAFE) & MASK_FAIL_SAFE)
+
+#define SCRATCH4_ACTIVE_CORES(x) (((x) >>SHIFT_ACTIVE_CORES) & MASK_ACTIVE_CORES)
+#define SCRATCH0_MEMSIZE(x) (((x) >>SHIFT_MEMSIZE) & MASK_MEMSIZE)
+#define SCRATCH7_FLASHVERSION(x) (((x) >>SHIFT_FLASHVERSION) & MASK_FLASHVERSION)
+#define SCRATCH7_FUSECONFIGREV(x) (((x) >>SHIFT_FUSE_CONFIG_REV) & MASK_FUSE_CONFIG_REV)
+#define SCRATCH13_MODEL(x) (((x) >>SHIFT_MODEL) & MASK_MODEL)
+#define SCRATCH13_FAMILY_DATA(x) (((x) >>SHIFT_FAMILY_DATA) & MASK_FAMILY_DATA)
+#define SCRATCH13_PROCESSOR(x) (((x) >>SHIFT_PROCESSOR) & MASK_PROCESSOR)
+#define SCRATCH13_EXTENDED_MODEL(x) (((x) >>SHIFT_EXTENDED_MODEL) & MASK_EXTENDED_MODEL)
+#define SCRATCH13_EXTENDED_FAMILY(x) (((x) >>SHIFT_EXTENDED_FAMILY) & MASK_EXTENDED_FAMILY)
+
+
+#define DBOX_READ(mmio, offset) \
+ readl((uint32_t*)((uint8_t*)(mmio) + (HOST_DBOX_BASE_ADDRESS + (offset))))
+#define DBOX_WRITE(value, mmio, offset) \
+ writel((value), (uint32_t*)((uint8_t*)(mmio) + (HOST_DBOX_BASE_ADDRESS + (offset))))
+
+#define SBOX_READ(mmio, offset) \
+ readl((uint32_t*)((uint8_t*)(mmio) + (HOST_SBOX_BASE_ADDRESS + (offset))))
+#define SBOX_WRITE(value, mmio, offset) \
+ writel((value), (uint32_t*)((uint8_t*)(mmio) + (HOST_SBOX_BASE_ADDRESS + (offset))))
+
// Compose a PCI bus/device/function/register configuration address.
// Every argument is parenthesized so callers may safely pass expressions
// (the original expanded e.g. `a+b << 16` as `a + (b << 16)`).
#define SET_BUS_DEV_FUNC(bus, device, function, reg_offset) \
	(((bus) << 16) | ((device) << 11) | ((function) << 8) | (reg_offset))
+
+#define GTT_READ(mmio, offset) \
+ readl((uint32_t*)((uint8_t*)(mmio) + (HOST_GTT_BASE_ADDRESS + (offset))))
+#define GTT_WRITE(value, mmio, offset) \
+ writel((value), (uint32_t*)((uint8_t*)(mmio) + (HOST_GTT_BASE_ADDRESS + (offset))))
+
+
// Enable all doorbell (4 bits) and DMA (8 bits) interrupt sources in the
// SBOX System Interrupt Cause Enable register 0 (read-modify-write).
// Wrapped in do { } while (0) so the multi-statement macro expands safely
// inside unbraced if/else bodies (the bare { } form breaks `if (x)
// ENABLE_MIC_INTERRUPTS(m); else ...`).
#define ENABLE_MIC_INTERRUPTS(mmio) do { \
	uint32_t sboxSice0reg = SBOX_READ((mmio), SBOX_SICE0); \
	sboxSice0reg |= SBOX_SICE0_DBR_BITS(0xf) | SBOX_SICE0_DMA_BITS(0xff); \
	SBOX_WRITE(sboxSice0reg, (mmio), SBOX_SICE0); \
	} while (0)
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+#endif
+
+#define DLDR_APT_BAR 0
+#define DLDR_MMIO_BAR 4
+
+#define PCI_VENDOR_INTEL 0x8086
+
+#define PCI_DEVICE_ABR_2249 0x2249
+#define PCI_DEVICE_ABR_224a 0x224a
+
+#define PCI_DEVICE_KNC_2250 0x2250
+#define PCI_DEVICE_KNC_2251 0x2251
+#define PCI_DEVICE_KNC_2252 0x2252
+#define PCI_DEVICE_KNC_2253 0x2253
+#define PCI_DEVICE_KNC_2254 0x2254
+#define PCI_DEVICE_KNC_2255 0x2255
+#define PCI_DEVICE_KNC_2256 0x2256
+#define PCI_DEVICE_KNC_2257 0x2257
+#define PCI_DEVICE_KNC_2258 0x2258
+#define PCI_DEVICE_KNC_2259 0x2259
+#define PCI_DEVICE_KNC_225a 0x225a
+
+#define PCI_DEVICE_KNC_225b 0x225b
+#define PCI_DEVICE_KNC_225c 0x225c
+#define PCI_DEVICE_KNC_225d 0x225d
+#define PCI_DEVICE_KNC_225e 0x225e
+
+#define MIC_CMDLINE_BUFSIZE 1024
+#define RESET_FAIL_TIME 300
+
+/* Masks for sysfs entries */
+#ifdef CONFIG_ML1OM
+#define MASK_COREVOLT 0xff
+#define MASK_COREFREQ 0xfff
+#endif
+#define MASK_MEMVOLT 0xff
+#define MASK_MEMORYFREQ 0xff
+#define MASK_MEMSIZE 0x1fffffff
+#define MASK_FLASHVERSION 0xffff
+#define MASK_SUBSTEPPING_DATA 0xf
+#define MASK_STEPPING_DATA 0xf
+#define MASK_MODEL 0xf
+#define MASK_FAMILY_DATA 0xf
+#define MASK_PROCESSOR 0x3
+#define MASK_PLATFORM 0x3
+#define MASK_EXTENDED_MODEL 0xf
+#define MASK_EXTENDED_FAMILY 0xff
+#define MASK_FUSE_CONFIG_REV 0x3ff
+#define MASK_ACTIVE_CORES 0x3f
+#define MASK_FAIL_SAFE 0xffffffff
+#define MASK_FLASH_UPDATE 0xffffffff
+/* Shifts for sysfs entries */
+#ifdef CONFIG_ML1OM
+#define SHIFT_COREVOLT 0
+#define SHIFT_COREFREQ 0
+#endif
+#define SHIFT_MEMVOLT 0
+#define SHIFT_MEMORYFREQ 0
+#define SHIFT_MEMSIZE 3
+#define SHIFT_FLASHVERSION 16
+#define SHIFT_SUBSTEPPING_DATA 0
+#define SHIFT_STEPPING_DATA 4
+#define SHIFT_MODEL 8
+#define SHIFT_FAMILY_DATA 12
+#define SHIFT_PROCESSOR 16
+#define SHIFT_PLATFORM 18
+#define SHIFT_EXTENDED_MODEL 20
+#define SHIFT_EXTENDED_FAMILY 24
+#define SHIFT_FUSE_CONFIG_REV 0
+#define SHIFT_ACTIVE_CORES 10
+#define SHIFT_FAIL_SAFE 0
+#define SHIFT_FLASH_UPDATE 0
+
+#define SKU_NAME_LEN 20
+
+/* Should be updated to reflect the latest interface version in sysfs and wmi property */
+#define LINUX_INTERFACE_VERSION "1.0"
+#define WINDOWS_INTERFACE_VERSION "1.0"
+
/* Requested boot mode for a card (see boot_linux_uos()/send_flash_cmd()). */
typedef enum mic_modes
{
	MODE_NONE,
	MODE_LINUX,
	MODE_ELF,
	MODE_FLASH
} MIC_MODES;
+
/*
 * Card life-cycle states. Transitions are made via mic_setstate(), which
 * logs them using the parallel micstates[] name table and notifies sysfs.
 */
typedef enum mic_status
{
	MIC_READY,
	MIC_BOOT,
	MIC_NORESPONSE,
	MIC_BOOTFAIL,
	MIC_ONLINE,
	MIC_SHUTDOWN,
	MIC_LOST,
	MIC_RESET,
	MIC_RESETFAIL,
	MIC_INVALID
} MIC_STATUS;
+
+typedef enum _product_platform_t
+{
+ PLATFORM_SILICON = 0,
+ PLATFORM_EMULATOR = 2,
+}product_platform_t;
+
+
+typedef enum _platform_resource_type
+{
+ PCI_APERTURE,
+ MMIO,
+ MAX_RESOURCE_TYPE
+}platform_resource_type;
+
/* One PCI BAR resource: driver mapping plus physical address and length. */
typedef struct _platform_resource_t
{
	uint8_t* va; // mapped by driver
	uint64_t pa; // from PCI config space
	uint64_t len;// from PCI config space
}platform_resource_t;
+
+
+typedef struct micscifhost_info {
+ dma_addr_t si_pa;
+ struct delayed_work si_bs_check;
+ uint32_t si_bs_wait_count;
+} scifhost_info_t;
+
#define MIC_NUM_DB 4 // number of doorbell interrupts per board
/* Per-board interrupt dispatch state (handlers registered per doorbell). */
typedef struct mic_irq {
	spinlock_t mi_lock; // NOTE(review): presumably guards mi_dblist -- confirm
	struct list_head mi_dblist[MIC_NUM_DB]; // The 4 doorbell interrupts.
	atomic_t mi_received;
} mic_irq_t;
+
+typedef struct sysfs_info {
+ char *cmdline;
+ char *kernel_cmdline;
+} sysfs_info_t;
+
+typedef struct pm_recv_msg {
+ struct list_head msg;
+ pm_msg_header msg_header;
+ void * msg_body;
+} pm_recv_msg_t;
+
+typedef struct pm_wq {
+ struct workqueue_struct *wq;
+ struct work_struct work;
+ char wq_name[20];
+} pm_wq_t;
+
+/*
+ * Driver wide power management context
+ * common power management context for all the devices
+ */
+typedef struct micscif_pm {
+ scif_epd_t epd;
+ atomic_t connected_clients;
+ pm_wq_t accept;
+ struct mutex pm_accept_mutex;
+ struct mutex pm_idle_mutex;
+ struct dentry *pmdbgparent_dir;
+ uint32_t enable_pm_logging;
+ atomic_t wakeup_in_progress;
+ uint8_t *nodemask;
+ uint32_t nodemask_len;
+} micscif_pm_t;
+
+/* per device power management context */
+typedef struct micpm_ctx
+{
+ scif_epd_t pm_epd;
+ PM_IDLE_STATE idle_state;
+ struct mutex msg_mutex;
+ struct list_head msg_list;
+ uint32_t pc6_timeout;
+ struct work_struct pm_close;
+ MIC_STATUS mic_suspend_state;
+ bool pc3_enabled;
+ bool pc6_enabled;
+ pm_msg_pm_options pm_options;
+ atomic_t pm_ref_cnt;
+ platform_resource_t nodemask;
+ pm_wq_t recv;
+ pm_wq_t handle_msg;
+ pm_wq_t resume;
+ struct workqueue_struct *pc6_entry_wq;
+ struct delayed_work pc6_entry_work;
+ char pc6_wq_name[20];
+ struct dentry *pmdbg_dir;
+ PM_CONNECTION_STATE con_state;
+ wait_queue_head_t disc_wq;
+} micpm_ctx_t;
+
+typedef struct _mic_ctx_t {
+ platform_resource_t mmio;
+ platform_resource_t aper;
+ uint32_t apic_id;
+ uint32_t msie;
+ ringbuffer ringbuff[MIC_ENG_MAX_SUPPORTED_ENGINES];
+ uint32_t rb_readoff __attribute__((aligned(64)));
+ micpm_ctx_t micpm_ctx;
+ CARD_USAGE_MODE card_usage_mode;
+ uint64_t adptr_base_pa;
+
+ int32_t bi_id;
+ mic_irq_t bi_irq;
+ struct tasklet_struct bi_dpc;
+ scifhost_info_t bi_scif;
+#ifdef USE_VCONSOLE
+ micvcons_t bi_vcons;
+#endif
+ void *bi_vethinfo;
+ struct mic_psmi_ctx bi_psmi;
+ struct pci_dev *bi_pdev;
+
+ MIC_STATUS state;
+ struct mutex state_lock;
+ MIC_MODES mode;
+ wait_queue_head_t resetwq;
+ char *image;
+ char *initramfs;
+ struct timer_list boot_timer;
+ unsigned long boot_start;
+ struct work_struct boot_ws;
+
+ struct workqueue_struct *resetworkq;
+ struct work_struct resetwork;
+ struct workqueue_struct *ioremapworkq;
+ struct work_struct ioremapwork;
+ wait_queue_head_t ioremapwq;
+ uint32_t reset_count;
+
+ atomic_t bi_irq_received;
+ uint8_t bi_stepping;
+ uint8_t bi_substepping;
+ product_platform_t bi_platform;
+ product_family_t bi_family;
+ struct board_info *bd_info;
+ sysfs_info_t sysfs_info;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0))
+ struct kernfs_node *sysfs_state;
+#else
+ struct sysfs_dirent *sysfs_state;
+#endif
+ spinlock_t sysfs_lock;
+ mic_dma_handle_t dma_handle;
+ uint32_t boot_mem;
+ mic_smpt_t *mic_smpt;
+ spinlock_t smpt_lock;
+ uint32_t sdbic1;
+ int64_t etc_comp;
+ spinlock_t ramoops_lock;
+ void *ramoops_va[2];
+ int ramoops_size;
+ dma_addr_t ramoops_pa[2];
+ struct proc_dir_entry *ramoops_dir;
+ struct proc_dir_entry *vmcore_dir;
+ /*
+ * List representing chunks of contiguous memory areas and
+ * their offsets in vmcore file.
+ */
+ struct list_head vmcore_list;
+ /* Stores the pointer to the buffer containing kernel elf core headers */
+ char *elfcorebuf;
+ size_t elfcorebuf_sz;
+ /* Total size of vmcore file. */
+ uint64_t vmcore_size;
+ int crash_count;
+ int boot_count;
+ void *log_buf_addr;
+ int *log_buf_len;
+ char sku_name[SKU_NAME_LEN];
+ atomic_t disconn_rescnt;
+ atomic_t gate_interrupt;
+ uint16_t numa_node;
+} mic_ctx_t;
+
+
+typedef struct mic_irqhander {
+ int (*ih_func)(mic_ctx_t *mic_ctx, int doorbell);
+ struct list_head ih_list;
+ char *ih_idstring;
+} mic_irqhandler_t;
+
+/* SKU related definitions and declarations */
+#define MAX_DEV_IDS 16
+typedef struct sku_info {
+ uint32_t fuserev_low;
+ uint32_t fuserev_high;
+ uint32_t memsize;
+ uint32_t memfreq;
+ char sku_name[SKU_NAME_LEN];
+ struct list_head sku;
+} sku_info_t;
+
+int sku_create_node(uint32_t fuserev_low,
+ uint32_t fuserev_high, uint32_t mem_size,
+ uint32_t mem_freq, char *sku_name,
+ sku_info_t ** newnode);
+
+int sku_build_table(void);
+void sku_destroy_table(void);
+int sku_find(mic_ctx_t *mic_ctx, uint32_t device_id);
+
+/* End SKU related definitions and declarations */
+
+#define MIC_NUM_MSIX_ENTRIES 1
+typedef struct mic_data {
+ int32_t dd_numdevs;
+ int32_t dd_inuse;
+#ifdef USE_VCONSOLE
+ micvcons_port_t dd_ports[MAX_BOARD_SUPPORTED];
+#endif
+ struct board_info *dd_bi[MAX_BOARD_SUPPORTED];
+ struct list_head dd_bdlist;
+ micscif_pm_t dd_pm;
+ uint64_t sysram;
+ struct fasync_struct *dd_fasync;
+ struct list_head sku_table[MAX_DEV_IDS];
+} mic_data_t;
+
+#include "mic_interrupts.h"
+extern mic_data_t mic_data;
+extern struct micscif_dev scif_dev[];
+
+typedef struct acptboot_data {
+ scif_epd_t listen_epd;
+ uint16_t acptboot_pn;
+ struct workqueue_struct *acptbootwq;
+ struct work_struct acptbootwork;
+}acptboot_data_t;
+
+void acptboot_exit(void);
+int acptboot_init(void);
+void adapter_init(void);
+int adapter_isr(mic_ctx_t *mic_ctx);
+int adapter_imsr(mic_ctx_t *mic_ctx);
+int adapter_remove(mic_ctx_t *mic_ctx);
+int adapter_do_ioctl(uint32_t cmd, uint64_t arg);
+int adapter_stop_device(mic_ctx_t *mic_ctx, int wait_reset, int reattempt);
+int adapter_shutdown_device(mic_ctx_t *mic_ctx);
+void calculate_etc_compensation(mic_ctx_t *mic_ctx);
+int adapter_probe(mic_ctx_t *mic_ctx);
+int adapter_post_boot_device(mic_ctx_t *mic_ctx);
+int adapter_start_device(mic_ctx_t *mic_ctx);
+int adapter_restart_device(mic_ctx_t *mic_ctx);
+int adapter_init_device(mic_ctx_t *mic_ctx);
+int pm_adapter_do_ioctl(mic_ctx_t *mic_ctx, void *in_buffer);
+int adapter_reset_depgraph(mic_ctx_t *mic_ctx);
+
+/*
+ * RESET_WAIT : launch the timer thread and wait for reset to complete
+ * The caller has to add itself to the resetwq by calling wait_for_reset
+ * RESET_REATTEMPT : Reattempt reset after detecting failures in reset
+ */
+#define RESET_WAIT 1
+#define RESET_REATTEMPT 1
+void adapter_reset(mic_ctx_t *mic_ctx, int wait_reset, int reattempt);
+
+void adapter_wait_reset(mic_ctx_t *mic_ctx);
+void get_adapter_memsize(uint8_t *mmio_va, uint32_t *adapter_mem_size);
+int wait_for_bootstrap(uint8_t *mmio_va);
+void post_boot_startup(struct work_struct *work);
+void attempt_reset(struct work_struct *work);
+
+int send_uos_escape(mic_ctx_t *mic_ctx, uint32_t uos_op,
+ uint32_t data_size, void *escape_data);
+int boot_linux_uos(mic_ctx_t *mic_ctx, char *imgname, char *initramfsname);
+
+int boot_micdev_app(mic_ctx_t *mic_ctx, char *imgname);
+int allocate_tools_buffer(mic_ctx_t *mic_ctx, uint32_t databuf_size,
+ uint32_t stsbuf_size, uint64_t *gddr_data_ptr,
+ uint64_t *gddr_stsbuf_ptr);
+
+int micpm_init(void);
+void micpm_uninit(void);
+int micpm_stop(mic_ctx_t *mic_ctx);
+int micpm_start(mic_ctx_t *mic_ctx);
+int micpm_probe(mic_ctx_t *mic_ctx);
+int micpm_remove(mic_ctx_t *mic_ctx);
+void micpm_nodemask_uninit(mic_ctx_t* mic_ctx);
+int micpm_nodemask_init(uint32_t num_devs, mic_ctx_t* mic_ctx);
+int micpm_disconn_init(uint32_t num_nodes);
+int micpm_disconn_uninit(uint32_t num_nodes);
+int micpm_dbg_init(mic_ctx_t *mic_ctx);
+void micpm_dbg_parent_init(void);
+int pm_reg_read(mic_ctx_t *mic_ctx, uint32_t regoffset);
+int micpm_update_pc6(mic_ctx_t *mic_ctx, bool set);
+int micpm_update_pc3(mic_ctx_t *mic_ctx, bool set);
+int pm_start_device(mic_ctx_t *mic_ctx);
+int pm_stop_device(mic_ctx_t *mic_ctx);
+int mic_pm_recv(mic_ctx_t *mic_ctx, void *msg, uint32_t len);
+int mic_pm_send_msg(mic_ctx_t *mic_ctx, PM_MESSAGE type,
+ void *msg, uint32_t len);
+
+int pm_pc3_entry(mic_ctx_t *mic_ctx);
+int pm_pc3_exit(mic_ctx_t *mic_ctx);
+int do_idlestate_entry(mic_ctx_t *mic_ctx);
+int do_idlestate_exit(mic_ctx_t *mic_ctx, bool get_ref);
+int is_idlestate_exit_needed(mic_ctx_t *mic_ctx);
+uint32_t mic_get_scifnode_id(mic_ctx_t *mic_ctx);
+
+mic_ctx_t* get_per_dev_ctx(uint16_t node);
+int get_num_devs(mic_ctx_t *mic_ctx, uint32_t *num_devs);
+
+
+void adapter_uninit(void);
+void adapter_add(mic_ctx_t *mic_ctx);
+void adapter_start(mic_ctx_t *mic_ctx);
+int send_flash_cmd(mic_ctx_t *mic_ctx, MIC_FLASH_CMD_TYPE type, void *data,
+ uint32_t len);
+int cmdline_mem(mic_ctx_t *mic_ctx, uint32_t mem);
+int get_cardside_mem(mic_ctx_t *mic_ctx, uint64_t start, uint64_t size, void *dest);
+
+int mic_pin_user_pages (void *data, struct page **pages, uint32_t len, int32_t *nf_pages, int32_t nr_pages);
+int mic_unpin_user_pages(struct page **pages, uint32_t nf_pages);
+product_family_t get_product_family(uint32_t device_id);
+void show_stepping_comm(mic_ctx_t *mic_ctx,char *buf);
+void micscif_destroy_p2p(mic_ctx_t *mic_ctx);
+
+#ifdef HOST
+void mic_smpt_init(mic_ctx_t *mic_ctx);
+void mic_smpt_restore(mic_ctx_t *mic_ctx);
+#endif
+void mic_smpt_uninit(mic_ctx_t *mic_ctx);
+int mic_dma_init(void);
+
+#ifndef _MIC_SCIF_
/*
 * Take a power-management reference on the device so it stays out of (or
 * leaves) an idle state while the caller uses it.
 * Returns 0 on success, -EINVAL for a NULL context, -ENODEV if the node is
 * lost or cannot be reconnected, -EAGAIN if the node is idle and the caller
 * did not request a forced wakeup.
 */
static __always_inline int micpm_get_reference(mic_ctx_t *mic_ctx, bool force_wakeup) {
	int err;
	if (!mic_ctx)
		return -EINVAL;

	if (mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_LOST)
		return -ENODEV;

	/* Fast path: bump pm_ref_cnt unless the node is idle (PM_NODE_IDLE). */
	if (unlikely(!atomic_add_unless(&mic_ctx->micpm_ctx.pm_ref_cnt,
		1, PM_NODE_IDLE))) {
		if (!force_wakeup) {
			if (is_idlestate_exit_needed(mic_ctx)) {
				return -EAGAIN;
			}
		}

		/* Wake the node. Note: err's specific value is discarded and
		 * -ENODEV is returned for any connect failure. */
		if ((err = micscif_connect_node(mic_get_scifnode_id(mic_ctx), true)) != 0)
			return -ENODEV;
	}
	return 0;
}
+#endif
+
/*
 * Drop a power-management reference taken with micpm_get_reference().
 * Returns -EINVAL/-ENODEV on an invalid or lost context, otherwise 0.
 * A resulting negative ref count is only logged; the count is not repaired.
 */
static __always_inline int micpm_put_reference(mic_ctx_t *mic_ctx) {
	int ret;

	if(!mic_ctx)
		return -EINVAL;

	if (mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_LOST)
		return -ENODEV;

	if (unlikely((ret = atomic_sub_return(1,
		&mic_ctx->micpm_ctx.pm_ref_cnt)) < 0)) {
		printk(KERN_ERR "%s %d Invalid PM ref_cnt %d \n",
			__func__, __LINE__, atomic_read(&mic_ctx->micpm_ctx.pm_ref_cnt));
	}

	return 0;

}
+
+static __always_inline int
+mic_hw_family(int node_id) {
+ mic_ctx_t *mic_ctx;
+
+ /* For Host Loopback */
+ if (!node_id)
+ return -EINVAL;
+
+ mic_ctx = get_per_dev_ctx(node_id - 1);
+ return mic_ctx->bi_family;
+}
+
/*
 * Block until the board leaves the MIC_RESET state. Each wait times out
 * after RESET_FAIL_TIME seconds and is simply retried, so this waits
 * indefinitely if reset never completes.
 */
static __always_inline void
wait_for_reset(mic_ctx_t *mic_ctx)
{
	int ret = 0;
	while (!ret) {
		/* nonzero return means the state condition became true */
		ret = wait_event_timeout(mic_ctx->resetwq,
			mic_ctx->state != MIC_RESET, RESET_FAIL_TIME * HZ);
	}
}
+
+/* Called only by host PM suspend */
/*
 * Wait (interruptibly, with a single RESET_FAIL_TIME-second timeout) until
 * the board is neither in MIC_RESET nor MIC_SHUTDOWN. Returns the raw
 * wait_event_interruptible_timeout() result: 0 on timeout, -ERESTARTSYS if
 * interrupted, otherwise the remaining jiffies (> 0).
 */
static __always_inline int
wait_for_shutdown_and_reset(mic_ctx_t *mic_ctx)
{
	int ret;
	ret = wait_event_interruptible_timeout(mic_ctx->resetwq,
		mic_ctx->state != MIC_RESET && mic_ctx->state != MIC_SHUTDOWN,
		RESET_FAIL_TIME * HZ);
	return ret;
}
+
/* Notify the user-space daemon (if one registered via fasync) with SIGIO. */
static __always_inline void
mic_signal_daemon(void)
{
	if (mic_data.dd_fasync != NULL)
		kill_fasync(&mic_data.dd_fasync, SIGIO, POLL_IN);
}
+
+extern char *micstates[];
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+#define __mic_create_singlethread_workqueue(name) alloc_ordered_workqueue(name, 0)
+#else
+#define __mic_create_singlethread_workqueue(name) create_singlethread_workqueue(name)
+#endif
+
/*
 * Record a board state transition and notify sysfs watchers.
 * NOTE(review): state is written without holding state_lock here --
 * presumably callers serialize transitions; confirm.
 */
static __always_inline void
mic_setstate(mic_ctx_t *mic_ctx, enum mic_status newstate)
{
	/* log with the human-readable names from the micstates[] table */
	printk("mic%d: Transition from state %s to %s\n", mic_ctx->bi_id,
	       micstates[mic_ctx->state], micstates[newstate]);
	mic_ctx->state = newstate;
	/* sysfs_lock guards sysfs_state against concurrent teardown */
	spin_lock_bh(&mic_ctx->sysfs_lock);
	if (mic_ctx->sysfs_state)
		sysfs_notify_dirent(mic_ctx->sysfs_state);
	spin_unlock_bh(&mic_ctx->sysfs_lock);
}
+
+#define MICREG_POSTCODE 0x242c
+
/* Read the card's boot POST code from the DBOX MICREG_POSTCODE register. */
static __always_inline uint32_t
mic_getpostcode(mic_ctx_t *mic_ctx)
{
	return DBOX_READ(mic_ctx->mmio.va, MICREG_POSTCODE);
}
+
/*
 * Return the silicon stepping of the board backing SCIF node 'node_id',
 * or -EINVAL for node 0 (host loopback).
 * NOTE(review): get_per_dev_ctx() result is dereferenced unchecked --
 * presumably node_id is always valid here; confirm.
 */
static __always_inline int
mic_hw_stepping(int node_id) {
	mic_ctx_t *mic_ctx;

	/* For Host Loopback */
	if (!node_id)
		return -EINVAL;

	mic_ctx = get_per_dev_ctx(node_id - 1);
	return mic_ctx->bi_stepping;
}
+
+#define MIC_IRQ_DB0 0
+#define MIC_IRQ_DB1 1
+#define MIC_IRQ_DB2 2
+#define MIC_IRQ_DB3 3
+#define MIC_IRQ_MAX MIC_IRQ_DB3
+
+int mic_reg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring,
+ int (*irqfunc)(mic_ctx_t *mic_ctx, int doorbell));
+int mic_unreg_irqhandler(mic_ctx_t *mic_ctx, int doorbell, char *idstring);
+void mic_enable_interrupts(mic_ctx_t *mic_ctx);
+void mic_disable_interrupts(mic_ctx_t *mic_ctx);
+void mic_enable_msi_interrupts(mic_ctx_t *mic_ctx);
+
+int micscif_init(void);
+void micscif_destroy(void);
+void micscif_probe(mic_ctx_t *mic_ctx);
+void micscif_remove(mic_ctx_t *mic_ctx);
+void micscif_start(mic_ctx_t *mic_ctx);
+void micscif_stop(mic_ctx_t *mic_ctx);
+
+mic_ctx_t *get_device_context(struct pci_dev *dev);
+void ramoops_exit(void);
+void vmcore_exit(void);
+int vmcore_create(mic_ctx_t *mic_ctx);
+void vmcore_remove(mic_ctx_t *mic_ctx);
+
+// loads file into memory
+int mic_get_file_size(const char *path, uint32_t *file_length);
+int mic_load_file(const char *fn, uint8_t *buffer, uint32_t max_size);
+#ifndef _MIC_SCIF_
+void mic_debug_init(mic_ctx_t *mic_ctx);
+#endif
+void mic_debug_uninit(void);
+void
+set_pci_aperture(mic_ctx_t *mic_ctx, uint32_t gtt_index, uint64_t phy_addr, uint32_t num_bytes);
+#ifdef __cplusplus
+};
+#endif
+
+#endif // __MIC_COMMON_H
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic_common.h"
+
+/* vnet/mic_shutdown/hvc/virtio */
+#define VNET_SBOX_INT_IDX 0
+#define MIC_SHT_SBOX_INT_IDX 1
+#define HVC_SBOX_INT_IDX 2
+#define VIRTIO_SBOX_INT_IDX 3
+#define PM_SBOX_INT_IDX 4
+
+#define MIC_BSP_INTERRUPT_VECTOR 229 // Host->Card(bootstrap) Interrupt Vector#
+/*
+ * Current usage of MIC interrupts:
+ * APICICR1 - mic shutdown interrupt
+ * APCICR0 - rest
+ *
+ * Planned Usage:
+ * SCIF - rdmasrs
+ * vnet/hvc/virtio - APICICR0
+ * mic shutdown interrupt - APICICR1
+ */
/*
 * Ring doorbell interrupt 'i' on the card by read-modify-writing the low
 * word of the corresponding SBOX APIC ICR register (SBOX_APICICR0 + i*8).
 */
static void __mic_send_intr(mic_ctx_t *mic_ctx, int i)
{
	uint32_t apicicr_low;
	uint64_t apic_icr_offset = SBOX_APICICR0 + i * 8;

	apicicr_low = SBOX_READ(mic_ctx->mmio.va, apic_icr_offset);
	/* for KNC we need to make sure we "hit" the send_icr bit (13) */
	if (mic_ctx->bi_family == FAMILY_KNC)
		apicicr_low = (apicicr_low | (1 << 13));

	/* MIC card only triggers when we write the lower part of the
	 * address (upper bits)
	 */
	SBOX_WRITE(apicicr_low, mic_ctx->mmio.va, apic_icr_offset);
}
+
/* Ring the vnet doorbell (VNET_SBOX_INT_IDX). */
static inline void mic_send_vnet_intr(mic_ctx_t *mic_ctx)
{
	__mic_send_intr(mic_ctx, VNET_SBOX_INT_IDX);
}

/* Ring the host virtual-console doorbell (HVC_SBOX_INT_IDX). */
static inline void mic_send_hvc_intr(mic_ctx_t *mic_ctx)
{
	__mic_send_intr(mic_ctx, HVC_SBOX_INT_IDX);
}

/* Ring doorbell 0 for SCIF.
 * NOTE(review): index 0 equals VNET_SBOX_INT_IDX; the usage comment above
 * says vnet/hvc/virtio share APICICR0 -- confirm the sharing is intended. */
static inline void mic_send_scif_intr(mic_ctx_t *mic_ctx)
{
	__mic_send_intr(mic_ctx, 0);
}

/* Ring the virtio doorbell (VIRTIO_SBOX_INT_IDX). */
static inline void mic_send_virtio_intr(mic_ctx_t *mic_ctx)
{
	__mic_send_intr(mic_ctx, VIRTIO_SBOX_INT_IDX);
}
+
+static inline void mic_send_sht_intr(mic_ctx_t *mic_ctx)
+{
+ __mic_send_intr(mic_ctx, 1);
+}
+
/* Ring the power-management doorbell (PM_SBOX_INT_IDX). */
static inline void mic_send_pm_intr(mic_ctx_t *mic_ctx)
{
	__mic_send_intr(mic_ctx, PM_SBOX_INT_IDX);
}
+
/*
 * Send the bootstrap interrupt (vector MIC_BSP_INTERRUPT_VECTOR) to the
 * card via SBOX_APICICR7: the destination APIC id goes in the high word
 * first, then writing the low word actually fires the interrupt.
 */
static inline void mic_send_bootstrap_intr(mic_ctx_t *mic_ctx)
{
	uint32_t apicicr_low;
	uint64_t apic_icr_offset = SBOX_APICICR7;
	int vector = MIC_BSP_INTERRUPT_VECTOR;

	if (mic_ctx->bi_family == FAMILY_ABR){
		apicicr_low = vector;
	} else {
		/* for KNC we need to make sure we "hit" the send_icr bit (13) */
		apicicr_low = (vector | (1 << 13));
	}

	SBOX_WRITE(mic_ctx->apic_id, mic_ctx->mmio.va, apic_icr_offset + 4);
	// MIC card only triggers when we write the lower part of the address (upper bits)
	SBOX_WRITE(apicicr_low, mic_ctx->mmio.va, apic_icr_offset);
}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#ifndef MICINT_H
+#define MICINT_H
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/fs.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/capability.h>
+#include <linux/uio.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/version.h>
+#include <asm/io.h>
+#include <asm/ioctl.h>
+#include <asm/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/rtnetlink.h>
+#include <linux/pm.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/ctype.h>
+#include <linux/sysfs.h>
+
+#include "mic_common.h"
+#include <mic/micscif.h>
+
+#define MAX_DLDR_MINORS 68
+/* Linux-specific global driver state (device nodes, class, PCI driver). */
+typedef struct mic_lindata {
+	dev_t dd_dev;			/* allocated char device number */
+	struct cdev dd_cdev;		/* character device */
+	struct device *dd_hostdev;	/* host device node */
+	struct device *dd_scifdev;	/* SCIF device node */
+	struct class *dd_class;		/* sysfs device class */
+	struct pci_driver dd_pcidriver;	/* PCI driver registration */
+} mic_lindata_t;
+
+/* Per-board bookkeeping wrapped around the common mic_ctx_t. */
+typedef struct board_info {
+	struct device *bi_sysfsdev;	/* sysfs device entry for this board */
+#ifdef CONFIG_PCI_MSI
+	struct msix_entry bi_msix_entries[MIC_NUM_MSIX_ENTRIES];	/* MSI-X vectors */
+#endif
+#ifdef USE_VCONSOLE
+	micvcons_port_t *bi_port;	/* virtual console port */
+#endif
+	void *bi_virtio; /* for virtio */
+
+	struct list_head bi_list;	/* list linkage */
+	mic_ctx_t bi_ctx;		/* common board context */
+} bd_info_t;
+
+extern mic_lindata_t mic_lindata;
+
+#ifdef USE_VCONSOLE
+/* Virtual console create/destroy for num_bds boards. */
+int micvcons_create(int num_bds);
+void micvcons_destroy(int num_bds);
+#endif
+
+/* Power-management suspend/resume hooks. */
+int micpm_suspend(struct device *pdev);
+int micpm_resume(struct device *pdev);
+int micpm_suspend_noirq(struct device *pdev);
+int micpm_resume_noirq(struct device *pdev);
+int micpm_notifier_block(struct notifier_block *nb, unsigned long event, void *dummy);
+/* Interrupt service routine registered for the board's IRQ. */
+irqreturn_t mic_irq_isr(int irq, void *data);
+
+int mic_psmi_init(mic_ctx_t *mic_ctx);
+void mic_psmi_uninit(mic_ctx_t *mic_ctx);
+
+/* sysfs attribute setup/teardown for a board context. */
+void set_sysfs_entries(mic_ctx_t *mic_ctx);
+void free_sysfs_entries(mic_ctx_t *mic_ctx);
+#endif // MICINT_H
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * Revised 15:05 11/24/2010
+ * Derived from SCIF SAS v0.41 with additional corrections
+ */
+
+#ifndef __SCIF_H__
+#define __SCIF_H__
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/poll.h>
+#include <linux/pci.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCIF_ACCEPT_SYNC 1
+#define SCIF_SEND_BLOCK 1
+#define SCIF_RECV_BLOCK 1
+
+/* Start: Deprecated Temporary definition for compatibility */
+#define ACCEPT_SYNC SCIF_ACCEPT_SYNC
+#define SEND_BLOCK SCIF_SEND_BLOCK
+#define RECV_BLOCK SCIF_RECV_BLOCK
+/* End: Deprecated Temporary definition for compatibility */
+
+enum {
+ SCIF_PROT_READ = (1<<0),
+ SCIF_PROT_WRITE = (1<<1)
+};
+
+/* 0x40 is used internally by scif */
+enum {
+ SCIF_MAP_FIXED = 0x10,
+ SCIF_MAP_KERNEL = 0x20,
+};
+
+enum {
+ SCIF_FENCE_INIT_SELF = (1<<0),
+ SCIF_FENCE_INIT_PEER = (1<<1)
+};
+
+enum {
+ SCIF_FENCE_RAS_SELF = (1<<2),
+ SCIF_FENCE_RAS_PEER = (1<<3)
+};
+
+enum {
+ SCIF_SIGNAL_LOCAL = (1<<4),
+ SCIF_SIGNAL_REMOTE = (1<<5)
+};
+
+#define SCIF_RMA_USECPU 1
+#define SCIF_RMA_USECACHE (1<<1)
+#define SCIF_RMA_SYNC (1<<2)
+#define SCIF_RMA_ORDERED (1<<3)
+//! @cond (Prevent doxygen from including these)
+#define SCIF_POLLIN POLLIN
+#define SCIF_POLLOUT POLLOUT
+#define SCIF_POLLERR POLLERR
+#define SCIF_POLLHUP POLLHUP
+#define SCIF_POLLNVAL POLLNVAL
+
+/* SCIF Reserved Ports */
+/* COI */
+#define SCIF_COI_PORT_0 40
+#define SCIF_COI_PORT_1 41
+#define SCIF_COI_PORT_2 42
+#define SCIF_COI_PORT_3 43
+#define SCIF_COI_PORT_4 44
+#define SCIF_COI_PORT_5 45
+#define SCIF_COI_PORT_6 46
+#define SCIF_COI_PORT_7 47
+#define SCIF_COI_PORT_8 48
+#define SCIF_COI_PORT_9 49
+
+/* OFED */
+#define SCIF_OFED_PORT_0 60
+#define SCIF_OFED_PORT_1 61
+#define SCIF_OFED_PORT_2 62
+#define SCIF_OFED_PORT_3 63
+#define SCIF_OFED_PORT_4 64
+#define SCIF_OFED_PORT_5 65
+#define SCIF_OFED_PORT_6 66
+#define SCIF_OFED_PORT_7 67
+#define SCIF_OFED_PORT_8 68
+#define SCIF_OFED_PORT_9 69
+
+/* NETDEV */
+#define SCIF_NETDEV_PORT_0 80
+#define SCIF_NETDEV_PORT_1 81
+#define SCIF_NETDEV_PORT_2 82
+#define SCIF_NETDEV_PORT_3 83
+#define SCIF_NETDEV_PORT_4 84
+#define SCIF_NETDEV_PORT_5 85
+#define SCIF_NETDEV_PORT_6 86
+#define SCIF_NETDEV_PORT_7 87
+#define SCIF_NETDEV_PORT_8 88
+#define SCIF_NETDEV_PORT_9 89
+
+/* RAS */
+#define SCIF_RAS_PORT_0 100
+#define SCIF_RAS_PORT_1 101
+#define SCIF_RAS_PORT_2 102
+#define SCIF_RAS_PORT_3 103
+#define SCIF_RAS_PORT_4 104
+#define SCIF_RAS_PORT_5 105
+#define SCIF_RAS_PORT_6 106
+#define SCIF_RAS_PORT_7 107
+#define SCIF_RAS_PORT_8 108
+#define SCIF_RAS_PORT_9 109
+
+/* Power Management */
+#define SCIF_PM_PORT_0 120
+#define SCIF_PM_PORT_1 121
+#define SCIF_PM_PORT_2 122
+#define SCIF_PM_PORT_3 123
+#define SCIF_PM_PORT_4 124
+#define SCIF_PM_PORT_5 125
+#define SCIF_PM_PORT_6 126
+#define SCIF_PM_PORT_7 127
+#define SCIF_PM_PORT_8 128
+#define SCIF_PM_PORT_9 129
+
+/* Board Tools */
+#define SCIF_BT_PORT_0 130
+#define SCIF_BT_PORT_1 131
+#define SCIF_BT_PORT_2 132
+#define SCIF_BT_PORT_3 133
+#define SCIF_BT_PORT_4 134
+#define SCIF_BT_PORT_5 135
+#define SCIF_BT_PORT_6 136
+#define SCIF_BT_PORT_7 137
+#define SCIF_BT_PORT_8 138
+#define SCIF_BT_PORT_9 139
+
+/* MIC Boot/Configuration support */
+#define MPSSD_MONRECV 160
+#define MIC_NOTIFY 161
+#define MPSSD_CRED 162
+#define MPSSD_MONSEND 163
+#define MPSSD_MICCTRL 164
+#define MPSSD_RESV5 165
+#define MPSSD_RESV6 166
+#define MPSSD_RESV7 167
+#define MPSSD_RESV8 168
+#define MPSSD_RESV9 169
+
+#define SCIF_ADMIN_PORT_END 1024
+
+/* MYO */
+#define SCIF_MYO_PORT_0 1025
+#define SCIF_MYO_PORT_1 1026
+#define SCIF_MYO_PORT_2 1027
+#define SCIF_MYO_PORT_3 1028
+#define SCIF_MYO_PORT_4 1029
+#define SCIF_MYO_PORT_5 1030
+#define SCIF_MYO_PORT_6 1031
+#define SCIF_MYO_PORT_7 1032
+#define SCIF_MYO_PORT_8 1033
+#define SCIF_MYO_PORT_9 1034
+
+/* SSG Tools */
+#define SCIF_ST_PORT_0 1044
+#define SCIF_ST_PORT_1 1045
+#define SCIF_ST_PORT_2 1046
+#define SCIF_ST_PORT_3 1047
+#define SCIF_ST_PORT_4 1048
+#define SCIF_ST_PORT_5 1049
+#define SCIF_ST_PORT_6 1050
+#define SCIF_ST_PORT_7 1051
+#define SCIF_ST_PORT_8 1052
+#define SCIF_ST_PORT_9 1053
+
+/* End of SCIF Reserved Ports */
+#define SCIF_PORT_RSVD 1088
+//! @endcond
+
+/* Opaque SCIF endpoint handle (see scif_open()). */
+typedef struct endpt *scif_epd_t;
+
+/* Opaque handle to a set of pinned pages. */
+typedef struct scif_pinned_pages *scif_pinned_pages_t;
+
+/* Describes a range of remote pages returned by the kernel-mode API. */
+struct scif_range {
+	void *cookie; /* opaque cookie identifying this range */
+	int nr_pages; /* Number of Pages */
+	int prot_flags; /* R/W protection (SCIF_PROT_*) */
+	/* Arrays phys_addr/va below are virtually contiguous */
+	dma_addr_t *phys_addr; /* Array of physical addresses */
+	void **va; /* Array of virtual addresses
+		    * and populated only when called
+		    * on the host for a remote SCIF
+		    * connection on MIC.
+		    */
+};
+
+/* poll(2)-style descriptor/event pair, used with scif_poll(). */
+struct scif_pollepd {
+	scif_epd_t epd; /* endpoint descriptor */
+	short events; /* requested events */
+	short revents; /* returned events */
+};
+
+/* Node hot-add/remove notification events. */
+enum scif_event_type {
+	SCIF_NODE_ADDED = 1<<0,
+	SCIF_NODE_REMOVED = 1<<1
+};
+
+/* Event payload for node add/remove notifications. */
+union eventd {
+	uint16_t scif_node_added;
+	uint16_t scif_node_removed;
+};
+
+/* Callback invoked when a scif_event_type event is delivered. */
+typedef void (*scif_callback_t)(enum scif_event_type event, union eventd
+data);
+
+struct scif_callback {
+	struct list_head list_member;		/* list linkage */
+	scif_callback_t callback_handler;	/* handler to invoke */
+};
+
+#define SCIF_OPEN_FAILED ((scif_epd_t)-1)
+#define SCIF_REGISTER_FAILED ((off_t)-1)
+#define SCIF_MMAP_FAILED ((void *)-1)
+
+/* Global port identifier: a (node, port) pair naming a SCIF port. */
+struct scif_portID {
+	uint16_t node; /* node on which port resides */
+	uint16_t port; /* Local port number */
+};
+
+/* Start: Deprecated Temporary definition for compatibility */
+#define portID scif_portID
+typedef struct portID portID_t;
+/* End: Deprecated Temporary definition for compatibility */
+
+/**
+ * scif_open - Create an endpoint
+ *
+ *\return
+ * The scif_open() function creates a new endpoint.
+ *
+ * Upon successful completion, scif_open() returns an endpoint descriptor to
+ * be used in subsequent SCIF functions calls to refer to that endpoint;
+ * otherwise: in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
+ * returned and errno is set to indicate the error; in kernel mode a NULL
+ * scif_epd_t is returned.
+ *
+ *\par Errors:
+ *- ENOMEM
+ * - Insufficient kernel memory was available.
+ *- ENXIO
+ * - Version mismatch between micscif driver and libscif.
+ */
+scif_epd_t scif_open(void);
+
+/**
+ * scif_bind - Bind an endpoint to a port
+ * \param epd endpoint descriptor
+ * \param pn port number
+ *
+ * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
+ * local node. If pn is zero, a port number greater than or equal to
+ * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
+ * exactly one local port. Ports less than 1024 when requested can only be bound
+ * by system (or root) processes or by processes executed by privileged users.
+ *
+ *\return
+ * Upon successful completion, scif_bind() returns the port number to which epd
+ * is bound; otherwise: in user mode -1 is returned and errno is set to
+ * indicate the error; in kernel mode the negative of one of the following
+ * errors is returned.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - The endpoint or the port are already bound.
+ *- EISCONN
+ * - The endpoint is already connected.
+ *- ENOSPC
+ * - No port number available for assignment (when pn==0).
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- EACCES
+ * - The port requested is protected and the user is not the superuser.
+*/
+int scif_bind(scif_epd_t epd, uint16_t pn);
+
+/**
+ * scif_listen - Listen for connections on an endpoint
+ *
+ * \param epd endpoint descriptor
+ * \param backlog maximum pending connection requests
+ *
+ * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
+ * an endpoint that will be used to accept incoming connection requests. Once
+ * so marked, the endpoint is said to be in the listening state and may not be
+ * used as the endpoint of a connection.
+ *
+ * The endpoint, epd, must have been bound to a port.
+ *
+ * The backlog argument defines the maximum length to which the queue of
+ * pending connections for epd may grow. If a connection request arrives when
+ * the queue is full, the client may receive an error with an indication that
+ * the connection was refused.
+ *
+ *\return
+ * Upon successful completion, scif_listen() returns 0; otherwise: in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode the
+ * negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - The endpoint is not bound to a port
+ *- EISCONN
+ * - The endpoint is already connected or listening
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+*/
+int scif_listen(scif_epd_t epd, int backlog);
+
+/**
+ * scif_connect - Initiate a connection on a port
+ * \param epd endpoint descriptor
+ * \param dst global id of port to which to connect
+ *
+ * The scif_connect() function requests the connection of endpoint epd to remote
+ * port dst. If the connection is successful, a peer endpoint, bound to dst, is
+ * created on node dst.node. On successful return, the connection is complete.
+ *
+ * If the endpoint epd has not already been bound to a port, scif_connect()
+ * will bind it to an unused local port.
+ *
+ * A connection is terminated when an endpoint of the connection is closed,
+ * either explicitly by scif_close(), or when a process that owns one of the
+ * endpoints of a connection is terminated.
+ *
+ *\return
+ * Upon successful completion, scif_connect() returns the port ID to which the
+ * endpoint, epd, is bound; otherwise: in user mode -1 is returned and errno is
+ * set to indicate the error; in kernel mode the negative of one of the
+ * following errors is returned.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNREFUSED
+ * - The destination was not listening for connections or refused the
+ * connection request.
+ *- EINTR
+ * - Interrupted function
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - dst.port is not a valid port ID
+ *- EISCONN
+ * - The endpoint is already connected
+ *- ENOBUFS
+ * - No buffer space is available
+ *- ENODEV
+ * - The destination node does not exist, or
+ * - The node is lost.
+ *- ENOSPC
+ * - No port number available for assignment (when pn==0).
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- EOPNOTSUPP
+ * - The endpoint is listening and cannot be connected
+*/
+int scif_connect(scif_epd_t epd, struct scif_portID *dst);
+
+/**
+ * scif_accept - Accept a connection on an endpoint
+ * \param epd endpoint descriptor
+ * \param peer global id of port to which connected
+ * \param newepd new connected endpoint descriptor
+ * \param flags flags
+ *
+ * The scif_accept() call extracts the first connection request on the queue of
+ * pending connections for the port on which epd is listening. scif_accept()
+ * creates a new endpoint, bound to the same port as epd, and allocates a new
+ * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
+ * endpoint is connected to the endpoint through which the connection was
+ * requested. epd is unaffected by this call, and remains in the listening
+ * state.
+ *
+ * On successful return, peer holds the global port identifier (node id and
+ * local port number) of the port which requested the connection.
+ *
+ * If the peer endpoint which requested the connection is closed, the endpoint
+ * returned by scif_accept() is closed.
+ *
+ * The number of connections that can (subsequently) be accepted on epd is only
+ * limited by system resources (memory).
+ *
+ * The flags argument is formed by OR'ing together zero or more of the
+ * following values:
+ *- SCIF_ACCEPT_SYNC: block until a connection request is presented. If
+ * SCIF_ACCEPT_SYNC is not in flags, and no pending
+ * connections are present on the queue, scif_accept()fails
+ * with an EAGAIN error
+ *
+ * On Linux in user mode, the select() and poll() functions can be used to
+ * determine when there is a connection request. On Microsoft Windows* and on
+ * Linux in kernel mode, the scif_poll() function may be used for this purpose.
+ * A readable event will be delivered when a connection is requested.
+ *
+ *\return
+ * Upon successful completion, scif_accept() returns 0; otherwise: in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode the
+ * negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EAGAIN
+ * - SCIF_ACCEPT_SYNC is not set and no connections are present to be accepted, or
+ * - SCIF_ACCEPT_SYNC is not set and remote node failed to complete its
+ * connection request
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- EINTR
+ * - Interrupted function
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - epd is not a listening endpoint
+ * - flags is invalid
+ * - peer is NULL
+ * - newepd is NULL
+ *- ENOBUFS
+ * - No buffer space is available
+ *- ENODEV
+ * - The requesting node is lost.
+ *- ENOMEM
+ * - Not enough space
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- ENOENT
+ * - Secondary part of epd registration failed.
+*/
+int scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t
+*newepd, int flags);
+
+/**
+ * scif_close - Close an endpoint
+ * \param epd endpoint descriptor
+ *
+ * scif_close() closes an endpoint and performs necessary teardown of
+ * facilities associated with that endpoint.
+ *
+ * If epd is a listening endpoint then it will no longer accept connection
+ * requests on the port to which it is bound. Any pending connection requests
+ * are rejected.
+ *
+ * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
+ * which are in-process through epd or its peer endpoint will complete before
+ * scif_close() returns. Registered windows of the local and peer endpoints are
+ * released as if scif_unregister() was called against each window.
+ *
+ * Closing an endpoint does not affect mappings to remote memory. These remain
+ * until explicitly removed by calling scif_munmap().
+ *
+ * If the peer endpoint's receive queue is not empty at the time that epd is
+ * closed, then the peer endpoint can be passed as the endpoint parameter to
+ * scif_recv() until the receive queue is empty.
+ *
+ * If epd is bound to a port, then the port is returned to the pool of
+ * available ports.
+ *
+ * epd is freed and may no longer be accessed.
+ *
+ *\return
+ * Upon successful completion, scif_close() returns 0; otherwise: in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode the
+ * negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor
+ */
+int scif_close(scif_epd_t epd);
+
+/**
+ * scif_send - Send a message
+ * \param epd endpoint descriptor
+ * \param msg message buffer address
+ * \param len message length
+ * \param flags blocking mode flags
+ *
+ * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
+ * are copied from memory starting at address msg. On successful execution the
+ * return value of scif_send() is the number of bytes that were sent, and is
+ * zero if no bytes were sent because len was zero. scif_send() may be called
+ * only when the endpoint is in a connected state.
+ *
+ * If a scif_send() call is non-blocking, then it sends only those bytes which
+ * can be sent without waiting, up to a maximum of len bytes.
+ *
+ * If a scif_send() call is blocking, then it normally returns after sending
+ * all len bytes. If a blocking call is interrupted or the connection is
+ * forcibly closed, the call is considered successful if some bytes were sent
+ * or len is zero, otherwise the call is considered unsuccessful.
+ *
+ * On Linux in user mode, the select() and poll() functions can be used to
+ * determine when the send queue is not full. On Microsoft Windows* and on
+ * Linux in kernel mode, the scif_poll() function may be used for this purpose.
+ *
+ * It is recommended that scif_send()/scif_recv() only be used for short
+ * control-type message communication between SCIF endpoints. The SCIF RMA
+ * APIs are expected to provide better performance for transfer sizes of
+ * 1024 bytes or longer.
+ *
+ * The flags argument is formed by ORing together zero or more of the following
+ * values:
+ *- SCIF_SEND_BLOCK: block until the entire message is sent.
+ *
+ *\return
+ * Upon successful completion, scif_send() returns the number of bytes sent;
+ * otherwise: in user mode -1 is returned and errno is set to indicate the
+ * error; in kernel mode the negative of one of the following errors is
+ * returned.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EFAULT
+ * - An invalid address was specified for a parameter.
+ *- EINTR
+ * - epd was closed by scif_close()
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - flags is invalid
+ * - len is negative
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOMEM
+ * - Not enough space
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ */
+int scif_send(scif_epd_t epd, void *msg, int len, int flags);
+
+/**
+ * scif_recv - Receive a message
+ * \param epd endpoint descriptor
+ * \param msg message buffer address
+ * \param len message buffer length
+ * \param flags blocking mode flags
+ *
+ * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
+ * data are copied to memory starting at address msg. On successful execution
+ * the return value of scif_recv() is the number of bytes that were received,
+ * and is zero if no bytes were received because len was zero. scif_recv() may
+ * be called only when the endpoint is in a connected state.
+ *
+ * If a scif_recv() call is non-blocking, then it receives only those bytes
+ * which can be received without waiting, up to a maximum of len bytes.
+ *
+ * If a scif_recv() call is blocking, then it normally returns after receiving
+ * all len bytes. If a blocking call is interrupted or the connection is
+ * forcibly closed, the call is considered successful if some bytes were
+ * received or len is zero, otherwise the call is considered unsuccessful;
+ * subsequent calls to scif_recv() will successfully receive all data sent
+ * through peer endpoint interruption or the connection was forcibly closed.
+ *
+ * On Linux in user mode, the select() and poll() functions can be used to
+ * determine when data is available to be received. On Microsoft Windows* and
+ * on Linux in kernel mode, the scif_poll() function may be used for this
+ * purpose.
+ *
+ * It is recommended that scif_send()/scif_recv() only be used for short
+ * control-type message communication between SCIF endpoints. The SCIF RMA
+ * APIs are expected to provide better performance for transfer sizes of
+ * 1024 bytes or longer.
+ *
+ * The flags argument is formed by ORing together zero or more of the following
+ * values:
+ *- SCIF_RECV_BLOCK: block until the entire message is received.
+ *
+ *\return
+ * Upon successful completion, scif_recv() returns the number of bytes
+ * received; otherwise: in user mode -1 is returned and errno is set to
+ * indicate the error; in kernel mode the negative of one of the following
+ * errors is returned.
+ *
+ *\par Errors:
+ *- EAGAIN
+ * - The destination node is returning from a low power state.
+ *- EBADF
+ * - epd is not a valid endpoint descriptor .
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EFAULT
+ * - An invalid address was specified for a parameter.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - flags is invalid, or
+ * - len is negative.
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOMEM
+ * - Not enough space.
+ *- ENOTCONN
+ * - The endpoint is not connected.
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ */
+int scif_recv(scif_epd_t epd, void *msg, int len, int flags);
+
+/**
+ * scif_register - Mark a memory region for remote access.
+ * \param epd endpoint descriptor
+ * \param addr starting virtual address
+ * \param len length of range
+ * \param offset offset of window
+ * \param prot_flags read/write protection flags
+ * \param map_flags mapping flags
+ *
+ * The scif_register() function opens a window, a range of whole pages of the
+ * registered address space of the endpoint epd, starting at offset po and
+ * continuing for len bytes. The value of po, further described below, is a
+ * function of the parameters offset and len, and the value of map_flags. Each
+ * page of the window represents the physical memory page which backs the
+ * corresponding page of the range of virtual address pages starting at addr
+ * and continuing for len bytes. addr and len are constrained to be multiples
+ * of the page size. addr is interpreted as a user space address. A successful
+ * scif_register() call returns po as the return value.
+ *
+ * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
+ * exactly, and offset is constrained to be a multiple of the page size. The
+ * mapping established by scif_register() will not replace any existing
+ * registration; an error is returned if any page within the range [offset,
+ * offset+len-1] intersects an existing window.
+ * Note: When SCIF_MAP_FIXED is set the current implementation limits
+ * offset to the range [0..2^62-1] and returns EADDRINUSE if the offset
+ * requested with SCIF_MAP_FIXED is in the range [2^62..2^63-1].
+ *
+ * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
+ * implementation-defined manner to arrive at po. The po value so chosen will
+ * be an area of the registered address space that the implementation deems
+ * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
+ * granting the implementation complete freedom in selecting po, subject to
+ * constraints described below. A non-zero value of offset is taken to be a
+ * suggestion of an offset near which the mapping should be placed. When the
+ * implementation selects a value for po, it does not replace any extant
+ * window. In all cases, po will be a multiple of the page size.
+ *
+ * The physical pages which are so represented by a window are available for
+ * access in calls to scif_mmap(), scif_readfrom(), scif_writeto(),
+ * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
+ * physical pages represented by the window will not be reused by the memory
+ * subsystem for any other purpose. Note that the same physical page may be
+ * represented by multiple windows.
+ *
+ * Subsequent operations which change the memory pages to which virtual
+ * addresses are mapped (such as mmap(), munmap(), scif_mmap() and
+ * scif_munmap()) have no effect on existing windows.
+ *
+ * On Linux, if the process will fork(), it is recommended that the registered
+ * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
+ * problems due to copy-on-write semantics.
+ *
+ * The prot_flags argument is formed by OR'ing together one or more of the
+ * following values:
+ *- SCIF_PROT_READ: allow read operations from the window
+ *- SCIF_PROT_WRITE: allow write operations to the window
+ *
+ * The map_flags argument is formed by OR'ing together zero or more of
+ * the following values:
+ *- SCIF_MAP_FIXED: interpret offset exactly
+ *
+ *\return
+ * Upon successful completion, scif_register() returns the offset at which the
+ * mapping was placed (po); otherwise: in user mode SCIF_REGISTER_FAILED (that
+ * is ((off_t)-1)) is returned and errno is set to indicate the error; in
+ * kernel mode the negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EADDRINUSE
+ * - SCIF_MAP_FIXED is set in map_flags, and pages in the range [offset,
+ * offset+len-1] are already registered
+ *- EAGAIN
+ * - The mapping could not be performed due to lack of resources
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EFAULT
+ * - Addresses in the range [addr , addr + len - 1] are invalid
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - map_flags is invalid, or
+ * - prot_flags is invalid, or
+ * - SCIF_MAP_FIXED is set in flags, and offset is not a multiple of
+ * the page size, or
+ * - addr is not a multiple of the page size, or
+ * - len is not a multiple of the page size, or is 0, or
+ * - offset is negative
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOMEM
+ * - Not enough space
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ */
+off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
+int prot_flags, int map_flags);
+
+/**
+ * scif_unregister - Mark a memory region for remote access.
+ * \param epd endpoint descriptor
+ * \param offset start of range to unregister
+ * \param len length of range to unregister
+ *
+ * The scif_unregister() function closes those previously registered windows
+ * which are entirely within the range [offset,offset+len-1]. It is an error to
+ * specify a range which intersects only a subrange of a window.
+ *
+ * On a successful return, pages within the window may no longer be specified
+ * in calls to scif_mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
+ * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window, however,
+ * continues to exist until all previous references against it are removed. A
+ * window is referenced if there is a mapping to it created by scif_mmap(), or if
+ * scif_get_pages() was called against the window (and the pages have not been
+ * returned via scif_put_pages()). A window is also referenced while an RMA, in
+ * which some range of the window is a source or destination, is in progress.
+ * Finally a window is referenced while some offset in that window was specified
+ * to scif_fence_signal(), and the RMAs marked by that call to
+ * scif_fence_signal() have not completed. While a window is in this state, its
+ * registered address space pages are not available for use in a new registered
+ * window.
+ *
+ * When all such references to the window have been removed, its references to
+ * all the physical pages which it represents are removed. Similarly, the
+ * registered address space pages of the window become available for
+ * registration in a new window.
+ *
+ *\return
+ * Upon successful completion, scif_unregister() returns 0; otherwise: in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned. In the event of an
+ * error, no windows are unregistered.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - The range [offset,offset+len-1] intersects a subrange of a window, or
+ * - offset is negative
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- ENXIO
+ * - Addresses in the range [offset,offset+len-1] are invalid for the
+ * registered address space of epd.
+ */
+int scif_unregister(scif_epd_t epd, off_t offset, size_t len);
+
+
+/**
+ * scif_readfrom - Copy from a remote address space
+ * \param epd endpoint descriptor
+ * \param loffset offset in local registered address space to
+ * which to copy
+ * \param len length of range to copy
+ * \param roffset offset in remote registered address space
+ * from which to copy
+ * \param rma_flags transfer mode flags
+ *
+ * scif_readfrom() copies len bytes from the remote registered address space of
+ * the peer of endpoint epd, starting at the offset roffset to the local
+ * registered address space of epd, starting at the offset loffset.
+ *
+ * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+
+ * len-1] must be within some registered window or windows of the local and
+ * remote nodes respectively. A range may intersect multiple registered
+ * windows, but only if those windows are contiguous in the registered address
+ * space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If
+ * rma_flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after
+ * the transfer is complete. Otherwise, the transfer may be performed
+ * asynchronously. The order in which any two asynchronous RMA operations
+ * complete is non-deterministic. The synchronization functions,
+ * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used
+ * to synchronize to the completion of asynchronous RMA operations.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * The optimal DMA performance will likely be realized if both
+ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if loffset and roffset are not
+ * cacheline aligned but are separated by some multiple of 64. The lowest level
+ * of performance is likely if loffset and roffset are not separated by a
+ * multiple of 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values:
+ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
+ * engine.
+ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
+ * transfer has completed. Passing this flag might result in
+ * the API busy waiting and consuming CPU cycles while the DMA
+ * transfer is in progress.
+ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
+ * the source range becomes visible on the destination node
+ * after all other transferred data in the source range has
+ * become visible on the destination
+ *
+ *\return
+ * Upon successful completion, scif_readfrom() returns 0; otherwise: in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ *\par Errors
+ *- EACCES
+ * - Attempt to write to a read-only range or read from a write-only range
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - rma_flags is invalid
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- ENXIO
+ * - The range [loffset,loffset+len-1] is invalid for the registered address
+ * space of epd, or,
+ * - The range [roffset,roffset+len-1] is invalid for the registered address
+ * space of the peer of epd, or
+ * - loffset or roffset is negative
+*/
+int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
+roffset, int rma_flags);
+
+/**
+ * scif_writeto - Copy to a remote address space
+ * \param epd endpoint descriptor
+ * \param loffset offset in local registered address space
+ * from which to copy
+ * \param len length of range to copy
+ * \param roffset offset in remote registered address space to
+ * which to copy
+ * \param rma_flags transfer mode flags
+ *
+ * scif_writeto() copies len bytes from the local registered address space of
+ * epd, starting at the offset loffset to the remote registered address space
+ * of the peer of endpoint epd, starting at the offset roffset.
+ *
+ * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+
+ * len-1] must be within some registered window or windows of the local and
+ * remote nodes respectively. A range may intersect multiple registered
+ * windows, but only if those windows are contiguous in the registered address
+ * space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If
+ * rma_flags includes SCIF_RMA_SYNC, then scif_writeto() will return after
+ * the transfer is complete. Otherwise, the transfer may be performed
+ * asynchronously. The order in which any two asynchronous RMA operations
+ * complete is non-deterministic. The synchronization functions,
+ * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used
+ * to synchronize to the completion of asynchronous RMA operations.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * The optimal DMA performance will likely be realized if both
+ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if loffset and roffset are not cacheline
+ * aligned but are separated by some multiple of 64. The lowest level of
+ * performance is likely if loffset and roffset are not separated by a multiple
+ * of 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values:
+ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
+ * engine.
+ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
+ * transfer has completed. Passing this flag might result in
+ * the API busy waiting and consuming CPU cycles while the DMA
+ * transfer is in progress.
+ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
+ * the source range becomes visible on the destination node
+ * after all other transferred data in the source range has
+ * become visible on the destination
+ *
+ *\return
+ * Upon successful completion, scif_writeto() returns 0; otherwise: in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EACCES
+ * - Attempt to write to a read-only range or read from a write-only range
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - rma_flags is invalid
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- ENXIO
+ * - The range [loffset,loffset+len-1] is invalid for the registered address
+ * space of epd, or,
+ * - The range [roffset , roffset + len -1] is invalid for the registered
+ * address space of the peer of epd, or
+ * - loffset or roffset is negative
+ */
+int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
+roffset, int rma_flags);
+
+/**
+ * scif_vreadfrom - Copy from a remote address space
+ * \param epd endpoint descriptor
+ * \param addr address to which to copy
+ * \param len length of range to copy
+ * \param roffset offset in remote registered address space
+ * from which to copy
+ * \param rma_flags transfer mode flags
+ *
+ * scif_vreadfrom() copies len bytes from the remote registered address
+ * space of the peer of endpoint epd, starting at the offset roffset, to local
+ * memory, starting at addr. addr is interpreted as a user space address.
+ *
+ * The specified range [roffset,roffset+len-1] must be within some registered
+ * window or windows of the remote node. The range may intersect
+ * multiple registered windows, but only if those windows are contiguous in the
+ * registered address space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If
+ * rma_flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after
+ * the transfer is complete. Otherwise, the transfer may be performed
+ * asynchronously. The order in which any two asynchronous RMA operations
+ * complete is non-deterministic. The synchronization functions,
+ * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used
+ * to synchronize to the completion of asynchronous RMA operations.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
+ * the specified local memory range may remain in a pinned state even after
+ * the specified transfer completes. This may reduce overhead if some or all of
+ * the same virtual address range is referenced in a subsequent call of
+ * scif_vreadfrom() or scif_vwriteto().
+ *
+ * The optimal DMA performance will likely be realized if both
+ * addr and roffset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if addr and roffset are not
+ * cacheline aligned but are separated by some multiple of 64. The lowest level
+ * of performance is likely if addr and roffset are not separated by a
+ * multiple of 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values:
+ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
+ * engine.
+ *- SCIF_RMA_USECACHE: enable registration caching
+ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
+ * transfer has completed. Passing this flag might result in
+ * the API busy waiting and consuming CPU cycles while the DMA
+ * transfer is in progress.
+ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
+ * the source range becomes visible on the destination node
+ * after all other transferred data in the source range has
+ * become visible on the destination
+ *
+ *\return
+ * Upon successful completion, scif_vreadfrom() returns 0; otherwise: in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EACCES
+ * - Attempt to write to a read-only range or read from a write-only range
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EFAULT
+ * - Addresses in the range [addr,addr+len-1] are invalid
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - rma_flags is invalid
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- ENXIO
+ * - Addresses in the range [roffset,roffset+len-1] are invalid for the
+ * registered address space of the peer of epd.
+ */
+int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
+int rma_flags);
+
+/**
+ * scif_vwriteto - Copy to a remote address space
+ * \param epd endpoint descriptor
+ * \param addr address from which to copy
+ * \param len length of range to copy
+ * \param roffset offset in remote registered address space to
+ * which to copy
+ * \param rma_flags transfer mode flags
+ *
+ * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
+ * the remote registered address space of the peer of endpoint epd, starting at
+ * the offset roffset. addr is interpreted as a user space address.
+ *
+ * The specified range [roffset,roffset+len-1] must be within some registered
+ * window or windows of the remote node. The range may intersect
+ * multiple registered windows, but only if those windows are contiguous in the
+ * registered address space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If
+ * rma_flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after
+ * the transfer is complete. Otherwise, the transfer may be performed
+ * asynchronously. The order in which any two asynchronous RMA operations
+ * complete is non-deterministic. The synchronization functions,
+ * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used
+ * to synchronize to the completion of asynchronous RMA operations.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
+ * the specified local memory range may remain in a pinned state even after
+ * the specified transfer completes. This may reduce overhead if some or all of
+ * the same virtual address range is referenced in a subsequent call of
+ * scif_vreadfrom() or scif_vwriteto().
+ *
+ * The optimal DMA performance will likely be realized if both
+ * addr and roffset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if addr and roffset are not cacheline
+ * aligned but are separated by some multiple of 64. The lowest level of
+ * performance is likely if addr and roffset are not separated by a multiple of
+ * 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values:
+ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
+ * engine.
+ *- SCIF_RMA_USECACHE: allow registration caching
+ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
+ * transfer has completed. Passing this flag might result in
+ * the API busy waiting and consuming CPU cycles while the DMA
+ * transfer is in progress.
+ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
+ * the source range becomes visible on the destination node
+ * after all other transferred data in the source range has
+ * become visible on the destination
+ *
+ *\return
+ * Upon successful completion, scif_vwriteto() returns 0; otherwise: in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EACCES
+ * - Attempt to write to a read-only range or read from a write-only range
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EFAULT
+ * - Addresses in the range [addr,addr+len-1] are invalid
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - rma_flags is invalid
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- ENXIO
+ * - Addresses in the range [roffset,roffset+len-1] are invalid for the
+ * registered address space of the peer of epd.
+ */
+int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
+int rma_flags);
+
+/**
+ * scif_fence_mark - Mark previously issued RMAs
+ * \param epd endpoint descriptor
+ * \param flags control flags
+ * \param mark marked handle returned as output.
+ *
+ * scif_fence_mark() returns after marking the current set of all uncompleted
+ * RMAs initiated through the endpoint epd or the current set of all
+ * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
+ * marked with a value returned at mark. The application may subsequently call
+ * scif_fence_wait(), passing the value returned at mark, to await completion
+ * of all RMAs so marked.
+ *
+ * The flags argument has exactly one of the following values:
+ *- SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint
+ * epd are marked
+ *- SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer
+ * of endpoint epd are marked
+ *
+ * \return
+ * Upon successful completion, scif_fence_mark() returns 0; otherwise: in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - flags is invalid, or
+ * - epd is not a valid endpoint descriptor, or
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOMEM
+ * - Insufficient kernel memory was available.
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ */
+int scif_fence_mark(scif_epd_t epd, int flags, int *mark);
+
+/**
+ * scif_fence_wait - Wait for completion of marked RMAs
+ *
+ * \param epd endpoint descriptor
+ * \param mark mark request
+ *
+ * scif_fence_wait() returns after all RMAs marked with mark have completed.
+ * The value passed in mark must have been obtained in a previous call to
+ * scif_fence_mark().
+ *
+ *\return
+ * Upon successful completion, scif_fence_wait() returns 0; otherwise: in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOMEM
+ * - Insufficient kernel memory was available.
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ */
+int scif_fence_wait(scif_epd_t epd, int mark);
+
+/**
+ * scif_fence_signal - Request a signal on completion of RMAs
+ * \param epd endpoint descriptor
+ * \param loff local offset
+ * \param lval local value to write to loff
+ * \param roff remote offset
+ * \param rval remote value to write to roff
+ * \param flags control flags
+ *
+ * scif_fence_signal() returns after marking the current set of all uncompleted
+ * RMAs initiated through the endpoint epd or marking the current set of all
+ * uncompleted RMAs initiated through the peer of endpoint epd.
+ *
+ * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
+ * marked set, lval is written to memory at the address corresponding to offset
+ * loff in the local registered address space of epd. loff must be within a
+ * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
+ * of the RMAs in the marked set, rval is written to memory at the address
+ * corresponding to offset roff in the remote registered address space of epd.
+ * roff must be within a remote registered window of the peer of epd. Note
+ * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
+ *
+ * The flags argument is formed by OR'ing together the following:
+ *- Exactly one of the following values:
+ * - SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint
+ * epd are marked
+ * - SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer
+ * of endpoint epd are marked
+ *- One or more of the following values:
+ * - SCIF_SIGNAL_LOCAL: On completion of the marked set of RMAs, write lval to
+ * memory at the address corresponding to offset loff in the local registered
+ * address space of epd.
+ * - SCIF_SIGNAL_REMOTE: On completion of the marked set of RMAs, write rval to
+ * memory at the address corresponding to offset roff in the remote registered
+ * address space of epd.
+ *
+ *\return
+ * Upon successful completion, scif_fence_signal() returns 0; otherwise: in
+ * user mode -1 is returned and errno is set to indicate the error; in kernel
+ * mode the negative of one of the following errors is returned.
+ *\par Errors:
+ *- EBADF
+ * - epd is not a valid endpoint descriptor
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - flags is invalid, or
+ * - loff or roff are not DWORD aligned
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENOTTY
+ * - epd is not a valid endpoint descriptor
+ *- ENXIO
+ * - loff is invalid for the registered address of epd, or
+ * - roff is invalid for the registered address space, of the peer of epd
+ */
+int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, off_t roff,
+uint64_t rval, int flags);
+
+/**
+ * scif_get_nodeIDs - Return information about online nodes
+ * \param nodes array in which to return online node IDs
+ * \param len number of entries in the nodes array
+ * \param self address to place the node ID of the local node
+ *
+ * scif_get_nodeIDs() fills in the nodes array with up to len node IDs of the
+ * nodes in the SCIF network. If there is not enough space in nodes, as
+ * indicated by the len parameter, only len node IDs are returned in nodes. The
+ * return value of scif_get_nodeIDs() is the total number of nodes currently in
+ * the SCIF network. By checking the return value against the len parameter, the user may
+ * determine if enough space for nodes was allocated.
+ *
+ * The node ID of the local node is returned at self.
+ *
+ *\return
+ * Upon successful completion, scif_get_nodeIDs() returns the actual number of
+ * online nodes in the SCIF network including 'self'; otherwise: in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode no
+ * errors are returned.
+ *
+ *\par Errors:
+ *- EFAULT
+ * - Bad address
+ */
+int scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self);
+
+
+/**
+ * scif_pin_pages - Pin a set of pages
+ * \param addr Virtual address of range to pin
+ * \param len Length of range to pin
+ * \param prot_flags Page protection flags
+ * \param map_flags Page classification flags
+ * \param pinned_pages Opaque handle of pinned pages
+ *
+ * scif_pin_pages() pins (locks in physical memory) the physical pages which
+ * back the range of virtual address pages starting at addr and continuing for
+ * len bytes. addr and len are constrained to be multiples of the page size. A
+ * successful scif_pin_pages() call returns an opaque pointer value at
+ * pinned_pages which may be used in subsequent calls to
+ * scif_register_pinned_pages().
+ *
+ * The pages will remain pinned as long as there is a reference against the
+ * scif_pinned_pages_t value returned by scif_pin_pages() and until
+ * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A
+ * reference is added to a scif_pinned_pages_t value each time a window is
+ * created by calling scif_register_pinned_pages() and passing the
+ * scif_pinned_pages_t value. A reference is removed from a scif_pinned_pages_t value
+ * each time such a window is deleted.
+ *
+ * Subsequent operations which change the memory pages to which virtual
+ * addresses are mapped (such as mmap(), munmap(), scif_mmap() and
+ * scif_munmap()) have no effect on the scif_pinned_pages_t value or windows
+ * created against it.
+ *
+ * On Linux, if the process will fork(), it is recommended that the registered
+ * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
+ * problems due to copy-on-write semantics.
+ *
+ * The prot_flags argument is formed by OR'ing together one or more of the
+ * following values:
+ *- SCIF_PROT_READ: allow read operations against the pages
+ *- SCIF_PROT_WRITE: allow write operations against the pages
+ * The map_flags argument is formed by OR'ing together zero or more of the
+ * following values:
+ *- SCIF_MAP_KERNEL: interpret addr as a kernel space address. By default, addr
+ * is interpreted as a user space address.
+ *
+ *\return
+ * Upon successful completion, scif_pin_pages() returns 0; otherwise the
+ * negative of one of the following errors is returned.
+ *\par Errors:
+ *- EFAULT
+ * - Addresses in the range [addr,addr+len-1] are invalid
+ *- EINVAL
+ * - prot_flags is invalid,
+ * - map_flags is invalid, or
+ * - offset is negative
+ *- ENOMEM
+ * - Not enough space
+ */
+int
+scif_pin_pages(
+ void *addr,
+ size_t len,
+ int prot_flags,
+ int map_flags,
+ scif_pinned_pages_t *pinned_pages);
+
+/**
+ * scif_unpin_pages - Unpin a set of pages
+ * \param pinned_pages Opaque handle of pages to be unpinned
+ *
+ * scif_unpin_pages() prevents scif_register_pinned_pages()from registering new
+ * windows against pinned_pages. The physical pages represented by pinned_pages
+ * will remain pinned until all windows previously registered against
+ * pinned_pages are deleted (the window is scif_unregister()'d and all
+ * references to the window are removed (see scif_unregister()).
+ *
+ * pinned_pages must have been obtained from a previous call to scif_pin_pages().
+ * After calling scif_unpin_pages(), it is an error to pass pinned_pages to
+ * scif_register_pinned_pages().
+ *
+ *\return:
+ * Upon successful completion, scif_unpin_pages() returns 0; otherwise the
+ * negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EINVAL
+ * - pinned_pages is not valid
+ */
+int
+scif_unpin_pages(
+ scif_pinned_pages_t pinned_pages);
+
+/**
+ * scif_register_pinned_pages - Mark a memory region for remote access.
+ * \param epd Endpoint descriptor
+ * \param pinned_pages Opaque handle of pinned pages
+ * \param offset Registered address space offset
+ * \param map_flags Flags which control where pages are mapped
+ *
+ * The scif_register_pinned_pages() function opens a window, a range of whole
+ * pages of the registered address space of the endpoint epd, starting at
+ * offset po. The value of po, further described below, is a function of the
+ * parameters offset and pinned_pages, and the value of map_flags. Each page of
+ * the window represents a corresponding physical memory page of the range
+ * represented by pinned_pages; the length of the window is the same as the
+ * length of range represented by pinned_pages. A successful
+ * scif_register_pinned_pages() call returns po as the return value.
+ *
+ * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
+ * exactly, and offset is constrained to be a multiple of the page size. The
+ * mapping established by scif_register_pinned_pages() will not replace any existing
+ * registration; an error is returned if any page of the new window would
+ * intersect an existing window.
+ *
+ * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
+ * implementation-defined manner to arrive at po. The po so chosen will be an
+ * area of the registered address space that the implementation deems suitable
+ * for a mapping of the required size. An offset value of 0 is interpreted as
+ * granting the implementation complete freedom in selecting po, subject to
+ * constraints described below. A non-zero value of offset is taken to be a
+ * suggestion of an offset near which the mapping should be placed. When the
+ * implementation selects a value for po, it does not replace any extant
+ * window. In all cases, po will be a multiple of the page size.
+ *
+ * The physical pages which are so represented by a window are available for
+ * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(),
+ * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
+ * physical pages represented by the window will not be reused by the memory
+ * subsystem for any other purpose. Note that the same physical page may be
+ * represented by multiple windows.
+ *
+ * Windows created by scif_register_pinned_pages() are unregistered by
+ * scif_unregister().
+ *
+ * The map_flags argument is formed by OR'ing together zero or more of the
+ * following values:
+ *- SCIF_MAP_FIXED: interpret offset exactly
+ *
+ *\return
+ * Upon successful completion, scif_register_pinned_pages() returns the offset
+ * at which the mapping was placed (po); otherwise the negative of one of the
+ * following errors is returned.
+ *\par Errors:
+ *- EADDRINUSE
+ * - SCIF_MAP_FIXED is set in map_flags and pages in the new
+ * window would intersect an existing window
+ *- EAGAIN
+ * - The mapping could not be performed due to lack of resources
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - map_flags is invalid, or
+ * - SCIF_MAP_FIXED is set in map_flags, and offset is not a
+ * multiple of the page size, or
+ * - offset is negative
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOMEM
+ * - Not enough space
+ *- ENOTCONN
+ * - The endpoint is not connected
+ */
+off_t
+scif_register_pinned_pages(
+ scif_epd_t epd,
+ scif_pinned_pages_t pinned_pages,
+ off_t offset,
+ int map_flags);
+
+/**
+ * scif_get_pages - Add references to remote registered pages
+ * \param epd endpoint descriptor
+ * \param offset registered address space offset
+ * \param len length of range of pages
+ * \param pages returned scif_range structure
+ *
+ * scif_get_pages() returns the addresses of the physical pages represented by
+ * those pages of the registered address space of the peer of epd, starting at
+ * offset and continuing for len bytes. offset and len are constrained to be
+ * multiples of the page size.
+ *
+ * All of the pages in the specified range [offset,offset+len-1] must be within
+ * a single window of the registered address space of the peer of epd.
+ *
+ * The addresses are returned as a virtually contiguous array pointed to by the
+ * phys_addr component of the scif_range structure whose address is returned in
+ * pages. The nr_pages component of scif_range is the length of the array. The
+ * prot_flags component of scif_range holds the protection flag value passed
+ * when the pages were registered.
+ *
+ * Each physical page whose address is returned by scif_get_pages() remains
+ * available and will not be released for reuse until the scif_range structure
+ * is returned in a call to scif_put_pages(). The scif_range structure returned
+ * by scif_get_pages() must be unmodified.
+ *
+ * It is an error to call scif_close() on an endpoint on which a scif_range
+ * structure of that endpoint has not been returned to scif_put_pages().
+ *
+ *\return
+ * Upon successful completion, scif_get_pages() returns 0; otherwise the
+ * negative of one of the following errors is returned.
+ *\par Errors:
+ *- ECONNRESET
+ * - A connection was forcibly closed by a peer.
+ *- EINVAL
+ * - epd is not a valid endpoint descriptor, or
+ * - offset is not a multiple of the page size, or
+ * - offset is negative, or
+ * - len is not a multiple of the page size
+ *- ENODEV
 * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected
+ *- ENXIO
+ * - Addresses in the range [offset,offset+len-1] are invalid
+ * for the registered address space of the peer epd.
+ */
+int scif_get_pages(
+ scif_epd_t epd,
+ off_t offset,
+ size_t len,
+ struct scif_range **pages);
+
+/**
+ * scif_put_pages - Remove references from remote registered pages
+ * \param pages pages to be returned
+ *
+ * scif_put_pages() releases a scif_range structure previously obtained by
+ * calling scif_get_pages(). The physical pages represented by pages may
+ * be reused when the window which represented those pages is unregistered.
+ * Therefore, those pages must not be accessed after calling scif_put_pages().
+ *
+ *\return
+ * Upon successful completion, scif_put_pages() returns 0; otherwise the
+ * negative of one of the following errors is returned.
+ *\par Errors:
+ *- EINVAL
+ * - pages does not point to a valid scif_range structure, or
+ * - the scif_range structure pointed to by pages was already returned.
+ *- ENODEV
+ * - The remote node is lost.
+ *- ENOTCONN
+ * - The endpoint is not connected.
+ */
+int scif_put_pages(
+ struct scif_range *pages);
+
+/**
+ * scif_poll - Wait for some event on an endpoint
+ * \param epds Array of endpoint descriptors
+ * \param nepds Length of epds
+ * \param timeout Upper limit on time for which scif_poll() will
+ * block
+ *
+ * scif_poll() waits for one of a set of endpoints to become ready to perform
+ * an I/O operation. scif_poll() exposes a subset of the functionality of the
+ * POSIX standard poll() function.
+ *
+ * The epds argument specifies the endpoint descriptors to be examined and the
+ * events of interest for each endpoint descriptor. epds is a pointer to an
+ * array with one member for each open endpoint descriptor of interest.
+ *
+ * The number of items in the epds array is specified in nepds. The epd field
+ * of scif_pollepd is an endpoint descriptor of an open endpoint. The field
+ * events is a bitmask specifying the events which the application is
+ * interested in. The field revents is an output parameter, filled by the
+ * kernel with the events that actually occurred. The bits returned in revents
+ * can include any of those specified in events, or one of the values
+ * SCIF_POLLERR, SCIF_POLLHUP, or SCIF_POLLNVAL. (These three bits are
+ * meaningless in the events field, and will be set in the revents field
+ * whenever the corresponding condition is true.)
+ *
+ * If none of the events requested (and no error) has occurred for any of the
+ * endpoint descriptors, then scif_poll() blocks until one of the events occurs.
+ *
+ * The timeout argument specifies an upper limit on the time for which
+ * scif_poll() will block, in milliseconds. Specifying a negative value in
+ * timeout means an infinite timeout.
+ *
+ * The following bits may be set in events and returned in revents:
+ *- SCIF_POLLIN: Data may be received without blocking. For a connected
+ * endpoint, this means that scif_recv() may be called without blocking. For a
+ * listening endpoint, this means that scif_accept() may be called without
+ * blocking.
+ *- SCIF_POLLOUT: Data may be sent without blocking. For a connected endpoint,
+ * this means that scif_send() may be called without blocking. This bit value
+ * has no meaning for a listening endpoint and is ignored if specified.
+ *
+ * The following bits are only returned in revents, and are ignored if set in
+ * events:
+ *- SCIF_POLLERR: An error occurred on the endpoint
+ *- SCIF_POLLHUP: The connection to the peer endpoint was disconnected
+ *- SCIF_POLLNVAL: The specified endpoint descriptor is invalid.
+ *
+ *\return
 * Upon successful completion, scif_poll() returns a non-negative value. A
 * positive value indicates the total number of endpoint descriptors that have
 * been selected (that is, endpoint descriptors for which the revents member is
 * non-zero). A value of 0 indicates that the call timed out and no endpoint
+ * descriptors have been selected. Otherwise: in user mode -1 is returned and
+ * errno is set to indicate the error; in kernel mode the negative of one of
+ * the following errors is returned.
+ *
+ *\par Errors:
+ *- EFAULT
+ * - The array given as argument was not contained in the calling program's
+ * address space.
+ *- EINTR
+ * - A signal occurred before any requested event.
+ *- EINVAL
+ * - The nepds argument is greater than {OPEN_MAX}
+ *- ENOMEM
+ * - There was no space to allocate file descriptor tables.
+*/
+int
+scif_poll(
+ struct scif_pollepd *epds,
+ unsigned int nepds,
+ long timeout);
+
+/**
+ * scif_event_register - Register an event handler
+ * \param handler Event handler to be registered
+ *
+ * scif_event_register() registers a routine, handler, to be called when some
+ * event occurs. The event parameter to handler indicates the type of event
+ * which has occurred, and the corresponding component of the data parameter to
+ * handler provides additional data about the event.
+ *
+ * The following events are defined:
+ *- SCIF_NODE_ADDED: A node has been added to the SCIF network. The
+ * scif_node_added component of the data parameter to handler identifies the
+ * node. This event is informational. There are no requirements on the event
+ * handler.
+ *- SCIF_NODE_REMOVED: A node is being removed from the SCIF network. The
+ * scif_node_removed component of the data parameter to handler identifies the
+ * node. Upon being called, and before returning, the event handler must
+ * return, using scif_put_pages(), all structures obtained using
+ * scif_get_pages() against an endpoint connected to the lost node. It is
+ * recommended and expected that the handler will also scif_close() all
+ * endpoints connected to the lost node.
+ *
+ *\return
+ * Upon successful completion scif_event_register() returns 0.
+ *
+ *\par Errors:
+ *- ENOMEM
+ * - There was no space to allocate file descriptor tables.
+*/
+
+int
+scif_event_register(
+ scif_callback_t handler);
+
+/**
+ * scif_event_unregister - Unregister event handler
+ * \param handler Event handler to be unregistered
+ *
+ * scif_event_unregister() unregisters the handler which was registered
+ * previously by using scif_event_register().
+ *
+ * WARNING: scif_event_unregister must be called before the module
+ * (that registered handles) exits for every handler that is registered.
+ * Failure to do so will result in crash of the scif module.
+ *
+ *\return
+ * Upon successful completion scif_event_unregister() returns 0.
+ *\par Errors:
+ *- EINVAL
 * - If the event handler was not found/registered.
+*/
+int
+scif_event_unregister(
+ scif_callback_t handler);
+
+/*
+ * Note: The callee can use pci_resource_start(dev, index) and
+ * pci_resource_len(dev, index) to obtain the PCI resource starting
+ * physical address and length for valid non null indexes of the va
+ * array. MMIO bars will not have IORESOURCE_PREFETCH set in the
+ * flags obtained from pci_resource_flags(dev, index). va[index]
+ * will be set to NULL for invalid resources.
+ */
/* Per-node PCI information returned by scif_pci_info(); see note above. */
struct scif_pci_info {
	/* pci_dev pointer associated with a node */
	struct pci_dev *pdev;
	/* Ioremapped virtual address base for every valid PCIe resource;
	 * NULL for invalid resources (see the note above this struct). */
	void __iomem *va[PCI_NUM_RESOURCES];
};
+
+/**
+ * scif_pci_info - Populate the scif_pci_info structure for a node.
+ * \param node The node to query
+ * \param dev The scif_pci_info structure to populate.
+ *
+ * scif_pci_info() populates the provided scif_pci_info structure
+ * associated with a node. The requested node ID cannot be the same as
+ * the current node. This routine will only return success when called from
+ * the host.
+ *
+ *\return
+ * Upon successful completion, scif_pci_info() returns 0; otherwise the
+ * negative of one of the following errors is returned.
+ *
+ *\par Errors:
+ *- EINVAL
+ * - The requested node is not valid.
+ * - Called on MIC instead of the host.
+ *- ENODEV
+ * - No pci_dev association exists for the node.
+ */
+int
+scif_pci_info(
+ uint16_t node,
+ struct scif_pci_info *dev);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __SCIF_H__ */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * -----------------------------------------
+ * SCIF IOCTL interface information
+ * -----------------------------------------
+ */
+#if defined(_WIN32) && !defined(_WIN64)
+#define ptr64_t __ptr64
+#else
+#define ptr64_t
+#endif
+
+/**
+ * The purpose of SCIF_VERSION is to check for compatibility between host and
+ * card SCIF modules and also between SCIF driver and libscif. This version
+ * should be incremented whenever a change is made to SCIF that affects the
+ * interface between SCIF driver and libscif or between the card and host SCIF
+ * driver components.
+ */
+#define SCIF_VERSION 1
+
+/**
+ * struct scifioctl_connect:
+ *
+ * \param self used to read back the assigned portID
+ * \param peer destination node and port to connect to
+ *
+ * This structure is used for CONNECT IOCTL.
+ */
struct scifioctl_connect {
	struct scif_portID self;	/* (out) used to read back the assigned portID */
	struct scif_portID peer;	/* (in) destination node and port to connect to */
};
+
+
+/**
+ * struct scifioctl_accept:
+ *
+ * \param flags flags
+ * \param peer global id of peer endpoint
 * \param endpt new connected endpoint descriptor
+ *
+ * This structure is used for SCIF_ACCEPTREQ IOCTL.
+ */
struct scifioctl_accept {
	int flags;			/* accept flags */
	struct scif_portID peer;	/* (out) global id of peer endpoint */
	void * ptr64_t endpt;		/* (out) new connected endpoint descriptor */
};
+
+/**
+ * struct scifioctl_msg:
+ *
+ * \param msg message buffer address
+ * \param len message length
+ * \param flags flags
+ * \param out_len Number of bytes sent/received.
+ *
+ * This structure is used for SCIF_SEND/SCIF_RECV IOCTL.
+ */
struct scifioctl_msg {
	void * ptr64_t msg;	/* message buffer address */
	int len;		/* message length */
	int flags;		/* flags */
	int out_len;		/* (out) number of bytes sent/received */
};
+
+/**
+ * struct scifioctl_reg:
+ *
+ * \param addr starting virtual address
+ * \param len length of range
+ * \param offset offset of window
+ * \param prot read/write protection
+ * \param flags flags
 * \param out_offset offset returned.
+ *
+ * This structure is used for SCIF_REG IOCTL.
+ */
struct scifioctl_reg {
	void * ptr64_t addr;	/* starting virtual address */
	uint64_t len;		/* length of range */
	off_t offset;		/* requested offset of window */
	int prot;		/* read/write protection */
	int flags;		/* flags */
	off_t out_offset;	/* (out) offset at which the window was placed */
};
+
+/**
+ * struct scifioctl_unreg:
+ *
+ * \param offset start of range to unregister
+ * \param len length of range to unregister
+ *
+ * This structure is used for SCIF_UNREG IOCTL.
+ */
struct scifioctl_unreg {
	off_t offset;	/* start of range to unregister */
	uint64_t len;	/* length of range to unregister */
};
+
+/**
+ * struct scifioctl_copy:
+ *
 * \param loffset offset in local registered address space to/from
 * which to copy
 * \param len length of range to copy
 * \param roffset offset in remote registered address space to/from
 * which to copy
 * \param addr user virtual address to/from which to copy
 * \param flags flags
 *
 * This structure is used for SCIF_READFROM, SCIF_WRITETO, SCIF_VREADFROM
 * and SCIF_VWRITETO IOCTL's.
 */
struct scifioctl_copy {
	off_t loffset;		/* offset in local registered address space */
	uint64_t len;		/* length of range to copy */
	off_t roffset;		/* offset in remote registered address space */
	uint8_t * ptr64_t addr;	/* user virtual address to/from which to copy */
	int flags;		/* flags */
};
+
+/**
+ * struct scifioctl_fence_mark:
+ *
+ * \param flags flags
+ * \param mark Fence handle returned by reference.
+ *
 * This structure is used for the SCIF_FENCE_MARK IOCTL.
+ */
struct scifioctl_fence_mark {
	int flags;	/* flags */
	int *mark;	/* (out) fence handle, returned by reference */
};
+
+/**
+ * struct scifioctl_fence_signal:
+ *
+ * \param loff local offset
+ * \param lval local value to write to loffset
+ * \param roff remote offset
+ * \param rval remote value to write to roffset
+ * \param flags flags
+ *
+ * This structure is used for SCIF_FENCE_SIGNAL IOCTL.
+ */
struct scifioctl_fence_signal {
	off_t loff;	/* local offset */
	uint64_t lval;	/* local value to write to loff */
	off_t roff;	/* remote offset */
	uint64_t rval;	/* remote value to write to roff */
	int flags;	/* flags */
};
+
+/**
+ * struct scifioctl_nodeIDs:
+ *
+ * \param nodes pointer to an array of nodeIDs
+ * \param len length of array
+ * \param self ID of the current node
+ *
+ * This structure is used for the SCIF_GET_NODEIDS ioctl
+ */
struct scifioctl_nodeIDs {
	uint16_t * ptr64_t nodes;	/* (out) pointer to an array of nodeIDs */
	int len;			/* length of the nodes array */
	uint16_t * ptr64_t self;	/* (out) ID of the current node */
};
+
+
+#define SCIF_BIND _IOWR('s', 1, int *)
+#define SCIF_LISTEN _IOW('s', 2, int)
+#define SCIF_CONNECT _IOWR('s', 3, struct scifioctl_connect *)
+#define SCIF_ACCEPTREQ _IOWR('s', 4, struct scifioctl_accept *)
+#define SCIF_ACCEPTREG _IOWR('s', 5, void *)
+#define SCIF_SEND _IOWR('s', 6, struct scifioctl_msg *)
+#define SCIF_RECV _IOWR('s', 7, struct scifioctl_msg *)
+#define SCIF_REG _IOWR('s', 8, struct scifioctl_reg *)
+#define SCIF_UNREG _IOWR('s', 9, struct scifioctl_unreg *)
+#define SCIF_READFROM _IOWR('s', 10, struct scifioctl_copy *)
+#define SCIF_WRITETO _IOWR('s', 11, struct scifioctl_copy *)
+#define SCIF_VREADFROM _IOWR('s', 12, struct scifioctl_copy *)
+#define SCIF_VWRITETO _IOWR('s', 13, struct scifioctl_copy *)
+#define SCIF_GET_NODEIDS _IOWR('s', 14, struct scifioctl_nodeIDs *)
+#define SCIF_FENCE_MARK _IOWR('s', 15, struct scifioctl_fence_mark *)
+#define SCIF_FENCE_WAIT _IOWR('s', 16, int)
+#define SCIF_FENCE_SIGNAL _IOWR('s', 17, struct scifioctl_fence_signal *)
+
+#define SCIF_GET_VERSION _IO('s', 23)
--- /dev/null
+# Options for the Intel Many Integrated Core Co-processor card driver
+#
+# p2p enables the use of the SCIF interface peer to peer communication
+# 1 to enable or 0 to disable
+#
+# p2p_proxy enables the use of SCIF P2P Proxy DMA which converts DMA
+# reads into DMA writes for performance on certain Intel platforms.
+# 1 to enable or 0 to disable
+#
+# reg_cache enables SCIF Registration Caching
+# 1 to enable or 0 to disable
+#
+# huge_page enables SCIF Huge Page Support
+# 1 to enable or 0 to disable
+#
+# watchdog enables the SCIF watchdog for Lost Node detection.
+# 1 to enable or 0 to disable
+#
+# watchdog_auto_reboot configures the behavior of the MIC host driver
+# upon detection of a lost node. This option is a nop if watchdog=0.
+# 1 Allow the host driver to reboot the node back to "online" state
+# 0 Allow the host driver to reset the node back to "ready" state.
# It will be up to the user to reboot the node or not.
+#
+# crash_dump enables uOS Kernel Crash Dump Captures
+# 1 to enable or 0 to disable
+#
+# ulimit enables ulimit checks on max locked memory for scif_register
+# 1 to enable or 0 to disable
+#
+options mic reg_cache=1 huge_page=1 watchdog=1 watchdog_auto_reboot=1 crash_dump=1 p2p=1 p2p_proxy=1 ulimit=0
+options mic_host reg_cache=1 huge_page=1 watchdog=1 watchdog_auto_reboot=1 crash_dump=1 p2p=1 p2p_proxy=1 ulimit=0
--- /dev/null
#!/bin/sh

# Load the mic driver if its sysfs class is not present yet.
[ -d /sys/class/mic ] || exec /sbin/modprobe mic >/dev/null 2>&1
--- /dev/null
# Kbuild fragment: builds two kernel modules from this directory.
obj-m := ringbuffer.o
obj-m += micscif.o

# ringbuffer.ko: standalone ring-buffer implementation.
ringbuffer-objs := micscif_rb.o

# micscif.ko: the SCIF driver proper, linked from the objects below.
micscif-objs := micscif_main.o
micscif-objs += micscif_sysfs.o
micscif-objs += micscif_smpt.o
micscif-objs += micscif_intr.o
micscif-objs += micscif_api.o
micscif-objs += micscif_fd.o
micscif-objs += micscif_nodeqp.o
micscif-objs += micscif_va_node.o
micscif-objs += micscif_va_gen.o
micscif-objs += micscif_rma.o
micscif-objs += micscif_rma_list.o
micscif-objs += micscif_rma_dma.o
micscif-objs += micscif_debug.o
micscif-objs += micscif_ports.o
micscif-objs += micscif_select.o
micscif-objs += micscif_nm.o
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/poll.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/module.h>
+#include "scif.h"
+#include "mic/micscif.h"
+#ifndef _MIC_SCIF_
+#include "mic_common.h"
+#endif
+#include "mic/micscif_map.h"
+
#define SCIF_MAP_ULIMIT 0x40

/* Enables ulimit checks for scif_register; presumably wired up as a
 * module parameter (see the modprobe options file) — TODO confirm.
 * Use 'false' rather than 0 for a bool initializer. */
bool mic_ulimit_check = false;

/* Human-readable endpoint state names, indexed by state value; used by
 * the pr_debug tracing in this file. */
char *scif_ep_states[] = {
	"Closed",
	"Unbound",
	"Bound",
	"Listening",
	"Connected",
	"Connecting",
	"Mapping",
	"Closing",
	"Close Listening",
	"Disconnected",
	"Zombie"};

/* Progress of an asynchronous connect on an endpoint. */
enum conn_async_state {
	ASYNC_CONN_IDLE = 1,	/* ep setup for async connect */
	ASYNC_CONN_INPROGRESS,	/* async connect in progress */
	ASYNC_CONN_FLUSH_WORK	/* async work flush in progress */
};
+
+/**
+ * scif_open() - Create a SCIF end point
+ *
+ * Create a SCIF end point and set the state to UNBOUND. This function
+ * returns the address of the end point data structure.
+ */
+scif_epd_t
+__scif_open(void)
+{
+ struct endpt *ep;
+
+ might_sleep();
+ if ((ep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL)) == NULL) {
+ printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point descriptor\n");
+ goto err_ep_alloc;
+ }
+
+ if ((ep->qp_info.qp = (struct micscif_qp *)
+ kzalloc(sizeof(struct micscif_qp), GFP_KERNEL)) == NULL) {
+ printk(KERN_ERR "SCIFAPI open: kzalloc fail on scif end point queue pointer\n");
+ goto err_qp_alloc;
+ }
+
+ spin_lock_init(&ep->lock);
+ mutex_init (&ep->sendlock);
+ mutex_init (&ep->recvlock);
+
+ if (micscif_rma_ep_init(ep) < 0) {
+ printk(KERN_ERR "SCIFAPI _open: RMA EP Init failed\n");
+ goto err_rma_init;
+ }
+
+ ep->state = SCIFEP_UNBOUND;
+ pr_debug("SCIFAPI open: ep %p success\n", ep);
+ return (scif_epd_t)ep;
+
+err_rma_init:
+ kfree(ep->qp_info.qp);
+err_qp_alloc:
+ kfree(ep);
+err_ep_alloc:
+ return NULL;
+}
+
+scif_epd_t
+scif_open(void)
+{
+ struct endpt *ep;
+ ep = (struct endpt *)__scif_open();
+ if (ep)
+ kref_init(&(ep->ref_count));
+ return (scif_epd_t)ep;
+}
+EXPORT_SYMBOL(scif_open);
+
+/**
+ * scif_close() - Terminate a SCIF end point
+ * @epd: The end point address returned from scif_open()
+ *
+ * The function terminates a scif connection. It must ensure all traffic on
+ * the connection is finished before removing it.
+ *
+ * On Connection with memory mapped this become more difficult. Once normal
+ * DMA and message traffic has ended the end point must be placed in a zombie
+ * state and wait for the other side to also release it's memory references.
+ */
int
__scif_close(scif_epd_t epd)
{
	struct endpt *ep = (struct endpt *)epd;
	struct endpt *tmpep;
	struct list_head *pos, *tmpq;
	unsigned long sflags;
	enum endptstate oldstate;
	int err;
	bool flush_conn;

	pr_debug("SCIFAPI close: ep %p %s\n", ep, scif_ep_states[ep->state]);

	might_sleep();

	/* Snapshot whether an async connect is in flight; if so, flush the
	 * connect workqueue before starting teardown. */
	spin_lock(&ep->lock);
	flush_conn = (ep->conn_async_state == ASYNC_CONN_INPROGRESS);
	spin_unlock(&ep->lock);

	if (flush_conn)
		flush_workqueue(ms_info.mi_conn_wq);

	/* Hold a node wakeup reference for the duration of the teardown;
	 * dropped at the bottom of this function. */
	micscif_inc_node_refcnt(ep->remote_dev, 1);

	spin_lock_irqsave(&ep->lock, sflags);
	oldstate = ep->state;

	ep->state = SCIFEP_CLOSING;

	switch (oldstate) {
	case SCIFEP_ZOMBIE:
		/* A zombie endpoint has already been torn down; reaching here
		 * again is a driver bug. */
		BUG_ON(SCIFEP_ZOMBIE == oldstate);
		/* fallthrough */
	case SCIFEP_CLOSED:
	case SCIFEP_DISCONNECTED:
		spin_unlock_irqrestore(&ep->lock, sflags);
		micscif_unregister_all_windows(epd);
		// Remove from the disconnected list
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
				break;
			}
		}
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
		break;
	case SCIFEP_UNBOUND:
	case SCIFEP_BOUND:
	case SCIFEP_CONNECTING:
		/* Nothing shared with a peer yet; the common teardown at the
		 * bottom of the function is sufficient. */
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	case SCIFEP_MAPPING:
	case SCIFEP_CONNECTED:
	case SCIFEP_CLOSING:
	{
		struct nodemsg msg;
		struct endpt *fep = NULL;
		struct endpt *tmpep;
		unsigned long ts = jiffies;
		struct list_head *pos, *tmpq;

		// Very short time before mapping completes and state becomes connected
		// and does a standard teardown.
		ts = jiffies;
		/* Busy-wait (bounded by NODE_ALIVE_TIMEOUT) for the mapping to
		 * finish; on timeout, force the state back to BOUND. */
		while (ep->state == SCIFEP_MAPPING) {
			cpu_relax();
			if (time_after((unsigned long)jiffies,ts + NODE_ALIVE_TIMEOUT)) {
				printk(KERN_ERR "%s %d ep->state %d\n", __func__, __LINE__, ep->state);
				ep->state = SCIFEP_BOUND;
				break;
			}
		}

		init_waitqueue_head(&ep->disconwq);	// Wait for connection queue
		spin_unlock_irqrestore(&ep->lock, sflags);

		micscif_unregister_all_windows(epd);

		// Remove from the connected list
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
				put_conn_count(ep->remote_dev);
				fep = tmpep;
				/* Take the ep lock while still holding the conn
				 * lock; released after the disconnect message is
				 * queued below. */
				spin_lock(&ep->lock);
				break;
			}
		}

		if (fep == NULL) {
			// The other side has completed the disconnect before
			// the end point can be removed from the list. Therefore
			// the ep lock is not locked, traverse the disconnected list
			// to find the endpoint, release the conn lock and
			// proceed to teardown the end point below.
			list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == ep) {
					list_del(pos);
					break;
				}
			}
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			break;
		}

		spin_unlock(&ms_info.mi_connlock);

		// Now we are free to close out the connection
		msg.uop = SCIF_DISCNCT;
		msg.src = ep->port;
		msg.dst = ep->peer;
		msg.payload[0] = (uint64_t)ep;
		msg.payload[1] = ep->remote_ep;

		err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);
		spin_unlock_irqrestore(&ep->lock, sflags);

		if (!err)
			/* Now wait for the remote node to respond */
			wait_event_timeout(ep->disconwq,
				(ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);
		/*
		 * Grab and release the ep lock to synchronize with the
		 * thread waking us up. If we dont grab this lock, then
		 * the ep might be freed before the wakeup completes
		 * resulting in potential memory corruption.
		 */
		spin_lock_irqsave(&ep->lock, sflags);
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	case SCIFEP_LISTENING:
	case SCIFEP_CLLISTEN:
	{
		struct conreq *conreq;
		struct nodemsg msg;
		struct endpt *aep;

		/* Lock order: drop the ep lock before taking the ep-list lock. */
		spin_unlock_irqrestore(&ep->lock, sflags);
		spin_lock_irqsave(&ms_info.mi_eplock, sflags);

		// remove from listen list
		list_for_each_safe(pos, tmpq, &ms_info.mi_listen) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
			}
		}
		// Remove any dangling accepts
		while (ep->acceptcnt) {
			aep = list_first_entry(&ep->li_accept, struct endpt, liacceptlist);
			BUG_ON(!aep);
			list_del(&aep->liacceptlist);
			if (aep->port.port && !aep->accepted_ep)
				put_scif_port(aep->port.port);
			list_for_each_safe(pos, tmpq, &ms_info.mi_uaccept) {
				tmpep = list_entry(pos, struct endpt, miacceptlist);
				if (tmpep == aep) {
					list_del(pos);
					break;
				}
			}
			/* Swap eplock for connlock while detaching the accepted
			 * endpoint from the connected/disconnected lists. */
			spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
			spin_lock_irqsave(&ms_info.mi_connlock, sflags);
			list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == aep) {
					list_del(pos);
					put_conn_count(aep->remote_dev);
					break;
				}
			}
			list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == aep) {
					list_del(pos);
					break;
				}
			}
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			micscif_teardown_ep(aep);
			spin_lock_irqsave(&ms_info.mi_eplock, sflags);
			micscif_add_epd_to_zombie_list(aep, MI_EPLOCK_HELD);
			ep->acceptcnt--;
		}

		spin_lock(&ep->lock);
		spin_unlock(&ms_info.mi_eplock);

		// Remove and reject any pending connection requests.
		while (ep->conreqcnt) {
			conreq = list_first_entry(&ep->conlist, struct conreq, list);
			list_del(&conreq->list);

			msg.uop = SCIF_CNCT_REJ;
			msg.dst.node = conreq->msg.src.node;
			msg.dst.port = conreq->msg.src.port;
			msg.payload[0] = conreq->msg.payload[0];
			msg.payload[1] = conreq->msg.payload[1];
			/*
			 * No Error Handling on purpose for micscif_nodeqp_send().
			 * If the remote node is lost we still want free the connection
			 * requests on the self node.
			 */
			micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, ep);

			ep->conreqcnt--;
			kfree(conreq);
		}

		// If a kSCIF accept is waiting wake it up
		wake_up_interruptible(&ep->conwq);
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	}
	/* Common teardown: release the local port, drop the node wakeup
	 * reference, and park the endpoint on the zombie list. */
	if (ep->port.port && !ep->accepted_ep)
		put_scif_port(ep->port.port);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	micscif_teardown_ep(ep);
	micscif_add_epd_to_zombie_list(ep, !MI_EPLOCK_HELD);
	return 0;
}
+
+void
+scif_ref_rel(struct kref *kref_count)
+{
+ struct endpt *epd;
+ epd = container_of(kref_count, struct endpt, ref_count);
+ __scif_close((scif_epd_t)epd);
+}
+
int
scif_close(scif_epd_t epd)
{
	/* Flush pending traffic first, then drop the caller's reference;
	 * the actual teardown (__scif_close()) runs from scif_ref_rel()
	 * when the last reference goes away. Always reports success. */
	__scif_flush(epd);
	put_kref_count(epd);
	return 0;
}
EXPORT_SYMBOL(scif_close);
+
+/**
+ * scif_flush() - Flush the endpoint
+ * @epd: The end point address returned from scif_open()
+ *
+ */
int
__scif_flush(scif_epd_t epd)
{
	struct endpt *ep = (struct endpt *)epd;
	struct endpt *tmpep;
	struct list_head *pos, *tmpq;
	unsigned long sflags;
	int err;

	might_sleep();

	/* Hold a node wakeup reference while flushing; dropped at the end. */
	micscif_inc_node_refcnt(ep->remote_dev, 1);

	spin_lock_irqsave(&ep->lock, sflags);

	switch (ep->state) {
	case SCIFEP_CONNECTED:
	{
		struct nodemsg msg;
		struct endpt *fep = NULL;

		init_waitqueue_head(&ep->disconwq);	// Wait for connection queue
		WARN_ON(ep->files); // files should never be set while connected
		spin_unlock_irqrestore(&ep->lock, sflags);
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);

		list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
			tmpep = list_entry(pos, struct endpt, list);
			if (tmpep == ep) {
				list_del(pos);
				put_conn_count(ep->remote_dev);
				fep = tmpep;
				/* ep lock taken while still holding the conn
				 * lock; released after the disconnect message
				 * is queued below. */
				spin_lock(&ep->lock);
				break;
			}
		}

		if (fep == NULL) {
			// The other side has completed the disconnect before
			// the end point can be removed from the list. Therefore
			// the ep lock is not locked, traverse the disconnected list
			// to find the endpoint, release the conn lock.
			list_for_each_safe(pos, tmpq, &ms_info.mi_disconnected) {
				tmpep = list_entry(pos, struct endpt, list);
				if (tmpep == ep) {
					list_del(pos);
					break;
				}
			}
			spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
			break;
		}

		spin_unlock(&ms_info.mi_connlock);

		/* Notify the peer that we are disconnecting. */
		msg.uop = SCIF_DISCNCT;
		msg.src = ep->port;
		msg.dst = ep->peer;
		msg.payload[0] = (uint64_t)ep;
		msg.payload[1] = ep->remote_ep;

		err = micscif_nodeqp_send(ep->remote_dev, &msg, ep);

		spin_unlock_irqrestore(&ep->lock, sflags);
		if (!err)
			/* Now wait for the remote node to respond */
			wait_event_timeout(ep->disconwq,
				(ep->state == SCIFEP_DISCONNECTED), NODE_ALIVE_TIMEOUT);
		/* Move the endpoint to the disconnected list under both locks. */
		spin_lock_irqsave(&ms_info.mi_connlock, sflags);
		spin_lock(&ep->lock);
		list_add_tail(&ep->list, &ms_info.mi_disconnected);
		ep->state = SCIFEP_DISCONNECTED;
		spin_unlock(&ep->lock);
		spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
		// Wake up threads blocked in send and recv
		wake_up_interruptible(&ep->sendwq);
		wake_up_interruptible(&ep->recvwq);
		break;
	}
	case SCIFEP_LISTENING:
	{
		ep->state = SCIFEP_CLLISTEN;

		// If an accept is waiting wake it up
		wake_up_interruptible(&ep->conwq);
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	default:
		/* Nothing to flush in other states. */
		spin_unlock_irqrestore(&ep->lock, sflags);
		break;
	}
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	return 0;
}
+
+/**
+ * scif_bind() - Bind a SCIF end point to a port ID.
+ * @epd: The end point address returned from scif_open()
+ * @pn: Port ID (number) to bind to, or zero to have one allocated
+ *
+ * Associates a local port with the end point and moves it into the
+ * bound state.
+ *
+ * Returns the bound port number on success, -EACCES when an admin port
+ * is requested without CAP_SYS_ADMIN, -EINVAL when the end point is
+ * already bound or the requested port cannot be reserved, -EISCONN when
+ * the end point has progressed past the unbound state, and -ENOSPC when
+ * no dynamic port is available.
+ */
+int
+__scif_bind(scif_epd_t epd, uint16_t pn)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ unsigned long lock_flags;
+ int err;
+
+ pr_debug("SCIFAPI bind: ep %p %s requested port number %d\n",
+ ep, scif_ep_states[ep->state], pn);
+
+ might_sleep();
+
+ /*
+ * Modeled on http://www.ietf.org/rfc/rfc1700.txt?number=1700
+ * SCIF ports below SCIF_ADMIN_PORT_END can only be bound by
+ * system (or root) processes or by processes executed by
+ * privileged users.
+ */
+ if (pn && pn < SCIF_ADMIN_PORT_END && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ spin_lock_irqsave(&ep->lock, lock_flags);
+
+ if (ep->state == SCIFEP_BOUND) {
+ err = -EINVAL;
+ goto unlock_out;
+ }
+ if (ep->state != SCIFEP_UNBOUND) {
+ err = -EISCONN;
+ goto unlock_out;
+ }
+
+ if (pn) {
+ /* Reserve exactly the port the caller asked for. */
+ if (rsrv_scif_port(pn) != pn) {
+ err = -EINVAL;
+ goto unlock_out;
+ }
+ } else {
+ /* Port zero requested: allocate any free port. */
+ pn = get_scif_port();
+ if (!pn) {
+ err = -ENOSPC;
+ goto unlock_out;
+ }
+ }
+
+ ep->state = SCIFEP_BOUND;
+ ep->port.node = ms_info.mi_nodeid;
+ ep->port.port = pn;
+ ep->conn_async_state = ASYNC_CONN_IDLE;
+ err = pn;
+ pr_debug("SCIFAPI bind: bound to port number %d\n", pn);
+
+unlock_out:
+ spin_unlock_irqrestore(&ep->lock, lock_flags);
+ return err;
+}
+
+/*
+ * scif_bind() - exported wrapper around __scif_bind().
+ *
+ * get_kref_count()/put_kref_count() bracket the call, presumably
+ * pinning the endpoint's reference count so it cannot be destroyed
+ * mid-bind -- verify against their definitions.
+ */
+int
+scif_bind(scif_epd_t epd, uint16_t pn)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_bind(epd, pn);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_bind);
+
+/**
+ * scif_listen() - Place the end point in the listening state
+ * @epd: The end point address returned from scif_open()
+ * @backlog: Maximum number of pending connection requests.
+ *
+ * Moves a bound end point into the listening state, records the backlog
+ * limit, and publishes the end point on the global listener list so
+ * remote connection requests can find it. The end point's queue pair is
+ * torn down first since a listener never uses it.
+ *
+ * Returns 0 on success, -EINVAL when the end point cannot listen from
+ * its current state, or -EISCONN when it is already listening or taking
+ * part in a connection.
+ */
+int
+__scif_listen(scif_epd_t epd, int backlog)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ unsigned long lock_flags;
+ int err = 0;
+
+ pr_debug("SCIFAPI listen: ep %p %s\n", ep, scif_ep_states[ep->state]);
+
+ might_sleep();
+ spin_lock_irqsave(&ep->lock, lock_flags);
+ switch (ep->state) {
+ case SCIFEP_ZOMBIE:
+ BUG_ON(SCIFEP_ZOMBIE == ep->state);
+ case SCIFEP_CLOSING:
+ case SCIFEP_CLOSED:
+ case SCIFEP_UNBOUND:
+ case SCIFEP_DISCONNECTED:
+ case SCIFEP_CLLISTEN:
+ err = -EINVAL;
+ break;
+ case SCIFEP_CONNECTING:
+ case SCIFEP_MAPPING:
+ case SCIFEP_CONNECTED:
+ case SCIFEP_LISTENING:
+ err = -EISCONN;
+ break;
+ case SCIFEP_BOUND:
+ break;
+ }
+ if (err) {
+ spin_unlock_irqrestore(&ep->lock, lock_flags);
+ return err;
+ }
+
+ ep->state = SCIFEP_LISTENING;
+ ep->backlog = backlog;
+ ep->conreqcnt = 0;
+ ep->acceptcnt = 0;
+ INIT_LIST_HEAD(&ep->conlist); // Pending connection requests
+ init_waitqueue_head(&ep->conwq); // Waiters in accept()
+ INIT_LIST_HEAD(&ep->li_accept); // User ep list for ACCEPTREG calls
+ spin_unlock_irqrestore(&ep->lock, lock_flags);
+
+ // A listener never uses its queue pair; drop it before the end
+ // point becomes visible on the listener list.
+ micscif_teardown_ep((void *)ep);
+ ep->qp_info.qp = NULL;
+
+ spin_lock_irqsave(&ms_info.mi_eplock, lock_flags);
+ list_add_tail(&ep->list, &ms_info.mi_listen);
+ spin_unlock_irqrestore(&ms_info.mi_eplock, lock_flags);
+ return 0;
+}
+
+/*
+ * scif_listen() - exported wrapper around __scif_listen().
+ *
+ * get_kref_count()/put_kref_count() bracket the call, presumably
+ * pinning the endpoint's reference count for the duration -- verify
+ * against their definitions.
+ */
+int
+scif_listen(scif_epd_t epd, int backlog)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_listen(epd, backlog);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_listen);
+
+#ifdef _MIC_SCIF_
+/*
+ * scif_p2p_connect:
+ * @node: destination node id
+ *
+ * Try to set up a p2p connection between the current node and the
+ * destination node. The host must broker the initial p2p connection,
+ * so a SCIF_NODE_CONNECT message is sent to the host, which acts as a
+ * proxy in setting up the p2p link.
+ *
+ * Returns 0 unless sending the message to the host failed. Note that
+ * the result of the wait below is deliberately ignored: even on
+ * timeout/interrupt this returns 0, and the caller (__scif_connect)
+ * re-checks remote_dev->sd_state and returns -ENODEV if the link never
+ * came up.
+ */
+static int scif_p2p_connect(int node)
+{
+ struct micscif_dev *remote_dev = &scif_dev[node];
+ struct nodemsg msg;
+ int err;
+
+ pr_debug("%s:%d SCIF_NODE_CONNECT to host\n", __func__, __LINE__);
+ // Keep the host node awake while the request is in flight
+ micscif_inc_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
+
+ msg.dst.node = SCIF_HOST_NODE;
+ msg.payload[0] = node;
+ msg.uop = SCIF_NODE_CONNECT;
+
+ if ((err = micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE],
+ &msg, NULL))) {
+ printk(KERN_ERR "%s:%d error while sending SCIF_NODE_CONNECT to"
+ " node %d\n", __func__, __LINE__, node);
+ micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
+ goto error;
+ }
+
+ // Wait until the host reports the peer as RUNNING or NOTPRESENT;
+ // the return value is intentionally not checked (see header comment).
+ wait_event_interruptible_timeout(remote_dev->sd_p2p_wq,
+ (remote_dev->sd_state == SCIFDEV_RUNNING) ||
+ (remote_dev->sd_state == SCIFDEV_NOTPRESENT), NODE_ALIVE_TIMEOUT);
+
+ pr_debug("%s:%d SCIF_NODE_CONNECT state:%d\n", __func__, __LINE__,
+ remote_dev->sd_state);
+ micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
+error:
+ return err;
+}
+#endif
+
+/*
+ * scif_conn_func - run the active (connecting) side of the SCIF
+ * connection handshake for @ep.
+ *
+ * Reserves a DMA channel, sets up the local half of the endpoint queue
+ * pair, sends SCIF_CNCT_REQ to the peer and waits for it to be
+ * processed. If the wait is interrupted or times out, SCIF_CNCT_TERM
+ * is sent (at most once) and the endpoint is forced back to
+ * SCIFEP_BOUND. On a grant, the queue-pair mapping is completed and
+ * SCIF_CNCT_GNTACK is returned to the accepting side, after which the
+ * endpoint is placed on the connected list.
+ *
+ * Returns 0 on success or a negative errno: -ECONNREFUSED when the
+ * peer bounced the request, -EINTR when interrupted, -ENODEV when the
+ * peer node died. Called both directly (blocking connect) and from
+ * micscif_conn_handler() (non-blocking connect).
+ */
+static int scif_conn_func(struct endpt *ep)
+{
+ int err = 0;
+ struct nodemsg msg;
+ unsigned long sflags;
+ int term_sent = 0; // guarantees SCIF_CNCT_TERM is sent at most once
+
+ if ((err = micscif_reserve_dma_chan(ep))) {
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ ep->state = SCIFEP_BOUND;
+ goto connect_error_simple;
+ }
+ // Initiate the first part of the endpoint QP setup
+ err = micscif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset,
+ ENDPT_QP_SIZE, ep->remote_dev);
+ if (err) {
+ printk(KERN_ERR "%s err %d qp_offset 0x%llx\n",
+ __func__, err, ep->qp_info.qp_offset);
+ ep->state = SCIFEP_BOUND;
+ goto connect_error_simple;
+ }
+
+ // Keep the remote node awake for the rest of the handshake
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+
+ // Format connect message and send it
+ msg.src = ep->port;
+ msg.dst = ep->conn_port;
+ msg.uop = SCIF_CNCT_REQ;
+ msg.payload[0] = (uint64_t)ep;
+ msg.payload[1] = ep->qp_info.qp_offset;
+ if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ goto connect_error_simple;
+ }
+ // Wait for request to be processed.
+ while ((err = wait_event_interruptible_timeout(ep->conwq,
+ (ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT)) <= 0) {
+ if (!err)
+ err = -ENODEV;
+
+ pr_debug("SCIFAPI connect: ep %p ^C detected\n", ep);
+ // interrupted out of the wait
+ if (!term_sent++) {
+ int bak_err = err; // preserve the original failure cause
+ msg.uop = SCIF_CNCT_TERM;
+ if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
+retry:
+ // Wait (uninterruptibly) for the TERM to take effect,
+ // retrying as long as the peer node stays alive.
+ err = wait_event_timeout(ep->diswq,
+ (ep->state != SCIFEP_CONNECTING), NODE_ALIVE_TIMEOUT);
+ if (!err && scifdev_alive(ep))
+ goto retry;
+ if (!err)
+ err = -ENODEV;
+ if (err > 0)
+ err = 0;
+ }
+ if (ep->state == SCIFEP_MAPPING) {
+ // The peer granted the connection while we were
+ // terminating: complete the map, then NACK the grant.
+ micscif_setup_qp_connect_response(ep->remote_dev,
+ ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);
+ // Send grant nack
+ msg.uop = SCIF_CNCT_GNTNACK;
+ msg.payload[0] = ep->remote_ep;
+ /* No error handling for Notification messages */
+ micscif_nodeqp_send(ep->remote_dev, &msg, ep);
+ }
+ // Ensure after that even after a timeout the state of the end point is bound
+ ep->state = SCIFEP_BOUND;
+ if (bak_err)
+ err = bak_err;
+ break;
+ }
+ }
+
+ if (err > 0)
+ err = 0; // positive wait_event return values mean success
+
+ if (term_sent || err) {
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ goto connect_error_simple;
+ }
+
+ if (ep->state == SCIFEP_MAPPING) {
+ err = micscif_setup_qp_connect_response(ep->remote_dev,
+ ep->qp_info.qp, ep->qp_info.cnct_gnt_payload);
+
+ // If the resource to map the queue are not available then we need
+ // to tell the other side to terminate the accept
+ if (err) {
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+
+ // Send grant nack
+ msg.uop = SCIF_CNCT_GNTNACK;
+ msg.payload[0] = ep->remote_ep;
+ /* No error handling for Notification messages */
+ micscif_nodeqp_send(ep->remote_dev, &msg, ep);
+
+ ep->state = SCIFEP_BOUND;
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ goto connect_error_simple;
+ }
+
+ // Send a grant ack to inform the accept we are done mapping its resources.
+ msg.uop = SCIF_CNCT_GNTACK;
+ msg.payload[0] = ep->remote_ep;
+ if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
+ ep->state = SCIFEP_CONNECTED;
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_add_tail(&ep->list, &ms_info.mi_connected);
+ get_conn_count(ep->remote_dev);
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+ pr_debug("SCIFAPI connect: ep %p connected\n", ep);
+ } else
+ ep->state = SCIFEP_BOUND;
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ goto connect_error_simple;
+
+ } else if (ep->state == SCIFEP_BOUND) {
+ pr_debug("SCIFAPI connect: ep %p connection refused\n", ep);
+ err = -ECONNREFUSED;
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ goto connect_error_simple;
+
+ } else {
+ pr_debug("SCIFAPI connect: ep %p connection interrupted\n", ep);
+ err = -EINTR;
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ goto connect_error_simple;
+ }
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+connect_error_simple:
+ return err;
+}
+
+/*
+ * micscif_conn_handler:
+ *
+ * Workqueue handler for servicing non-blocking SCIF connects.
+ *
+ * Drains ms_info.mi_nb_connect_list: each queued endpoint is removed
+ * under the list lock, the (blocking) connect sequence is run for it,
+ * and any thread waiting in scif_connect() to collect the result is
+ * woken through conn_pend_wq.
+ */
+void micscif_conn_handler(struct work_struct *work)
+{
+ for (;;) {
+ struct endpt *ep = NULL;
+
+ spin_lock(&ms_info.mi_nb_connect_lock);
+ if (!list_empty(&ms_info.mi_nb_connect_list)) {
+ ep = list_first_entry(&ms_info.mi_nb_connect_list,
+ struct endpt, conn_list);
+ list_del(&ep->conn_list);
+ }
+ spin_unlock(&ms_info.mi_nb_connect_lock);
+
+ if (!ep)
+ break;
+
+ ep->conn_err = scif_conn_func(ep);
+ wake_up_interruptible(&ep->conn_pend_wq);
+ }
+}
+
+/**
+ * scif_connect() - Request a connection to a remote node
+ * @epd: The end point address returned from scif_open()
+ * @dst: Remote node address information
+ *
+ * The function requests a scif connection to the remote node
+ * identified by the dst parameter. "dst" contains the remote node and
+ * port ids.
+ *
+ * Upon successful completion a zero will be returned.
+ *
+ * If the end point is not in the bound state -EINVAL will be returned.
+ *
+ * If during the connection sequence resource allocation fails the -ENOMEM
+ * will be returned.
+ *
+ * If the remote side is not responding to connection requests the caller may
+ * terminate this function with a signal. If so a -EINTR will be returned.
+ *
+ * When non_block is true the connect is queued to the connection
+ * workqueue and -EINPROGRESS is returned; a later call collects the
+ * result via the ASYNC_CONN_FLUSH_WORK path.
+ */
+int
+__scif_connect(scif_epd_t epd, struct scif_portID *dst, bool non_block)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ unsigned long sflags;
+ int err = 0;
+#ifdef _MIC_SCIF_
+ struct micscif_dev *remote_dev;
+#endif
+
+ pr_debug("SCIFAPI connect: ep %p %s\n", ep,
+ scif_ep_states[ep->state]);
+
+ if (dst->node > MAX_BOARD_SUPPORTED)
+ return -ENODEV;
+
+ might_sleep();
+
+#ifdef _MIC_SCIF_
+ // Card side: ask the host to broker a p2p link first if the peer
+ // is not up yet and p2p is enabled.
+ remote_dev = &scif_dev[dst->node];
+ if ((SCIFDEV_INIT == remote_dev->sd_state ||
+ SCIFDEV_STOPPED == remote_dev->sd_state) && mic_p2p_enable)
+ if ((err = scif_p2p_connect(dst->node)))
+ return err;
+#endif
+
+ if (SCIFDEV_RUNNING != scif_dev[dst->node].sd_state &&
+ SCIFDEV_SLEEPING != scif_dev[dst->node].sd_state)
+ return -ENODEV;
+
+ spin_lock_irqsave(&ep->lock, sflags);
+ switch (ep->state) {
+ case SCIFEP_ZOMBIE:
+ BUG_ON(SCIFEP_ZOMBIE == ep->state);
+
+ case SCIFEP_CLOSED:
+ case SCIFEP_CLOSING:
+ err = -EINVAL;
+ break;
+
+ case SCIFEP_DISCONNECTED:
+ // Only valid as the collection step of a prior non-blocking
+ // connect; otherwise a disconnected ep cannot connect.
+ if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
+ ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
+ else
+ err = -EINVAL;
+ break;
+
+ case SCIFEP_LISTENING:
+ case SCIFEP_CLLISTEN:
+ err = -EOPNOTSUPP;
+ break;
+
+ case SCIFEP_CONNECTING:
+ case SCIFEP_MAPPING:
+ if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
+ err = -EINPROGRESS;
+ else
+ err = -EISCONN;
+ break;
+
+ case SCIFEP_CONNECTED:
+ if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
+ ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
+ else
+ err = -EISCONN;
+ break;
+
+ case SCIFEP_UNBOUND:
+ // Implicit bind to a dynamically allocated local port.
+ if ((ep->port.port = get_scif_port()) == 0)
+ err = -ENOSPC;
+ else {
+ ep->port.node = ms_info.mi_nodeid;
+ ep->conn_async_state = ASYNC_CONN_IDLE;
+ }
+ /* Fall through */
+ case SCIFEP_BOUND:
+ /*
+ * If a non-blocking connect has been already initiated (conn_async_state
+ * is either ASYNC_CONN_INPROGRESS or ASYNC_CONN_FLUSH_WORK), the end point
+ * could end up in SCIF_BOUND due an error in the connection
+ * process (e.g., connnection refused)
+ * If conn_async_state is ASYNC_CONN_INPROGRESS - transition to
+ * ASYNC_CONN_FLUSH_WORK so that the error status can be collected.
+ * If the state is already ASYNC_CONN_FLUSH_WORK - then set the error
+ * to EINPROGRESS since some other thread is waiting to collect error status.
+ */
+ if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
+ ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
+ else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
+ err = -EINPROGRESS;
+ else {
+ ep->conn_port = *dst;
+ init_waitqueue_head(&ep->sendwq);
+ init_waitqueue_head(&ep->recvwq);
+ init_waitqueue_head(&ep->conwq);
+ init_waitqueue_head(&ep->diswq);
+ ep->conn_async_state = 0;
+
+ if (unlikely(non_block))
+ ep->conn_async_state = ASYNC_CONN_INPROGRESS;
+ }
+ break;
+ }
+
+ if (err || ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
+ goto connect_simple_unlock1;
+
+ ep->state = SCIFEP_CONNECTING;
+ ep->remote_dev = &scif_dev[dst->node];
+ ep->sd_state = SCIFDEV_RUNNING;
+ ep->qp_info.qp->magic = SCIFEP_MAGIC;
+ ep->qp_info.qp->ep = (uint64_t)ep;
+ if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
+ // Hand the connect off to the workqueue; the caller polls or
+ // re-calls later to collect the result.
+ init_waitqueue_head(&ep->conn_pend_wq);
+ spin_lock(&ms_info.mi_nb_connect_lock);
+ list_add_tail(&ep->conn_list,
+ &ms_info.mi_nb_connect_list);
+ spin_unlock(&ms_info.mi_nb_connect_lock);
+ err = -EINPROGRESS;
+ queue_work(ms_info.mi_conn_wq, &ms_info.mi_conn_work);
+ }
+connect_simple_unlock1:
+ spin_unlock_irqrestore(&ep->lock, sflags);
+
+ // NOTE(review): conn_async_state is read below without ep->lock
+ // held; presumably benign since only this path resets it -- verify.
+ if (err)
+ return err;
+ else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) {
+ // Collect the result of a previously queued non-blocking connect.
+ flush_workqueue(ms_info.mi_conn_wq);
+ err = ep->conn_err;
+ spin_lock_irqsave(&ep->lock, sflags);
+ ep->conn_async_state = ASYNC_CONN_IDLE;
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ } else {
+ err = scif_conn_func(ep);
+ }
+ return err;
+}
+
+/*
+ * scif_connect() - exported blocking connect; wrapper around
+ * __scif_connect() with non_block == false.
+ *
+ * get_kref_count()/put_kref_count() bracket the call, presumably
+ * pinning the endpoint's reference count for the duration -- verify
+ * against their definitions.
+ */
+int
+scif_connect(scif_epd_t epd, struct scif_portID *dst)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_connect(epd, dst, false);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_connect);
+
+/**
+ * scif_accept() - Accept a connection request from the remote node
+ * @epd: The end point address returned from scif_open()
+ * @peer: Filled in with peer node and port information
+ * @newepd: New end point created for connection
+ * @flags: Indicates synchronous or asynchronous mode
+ *
+ * The function accepts a connection request from the remote node. Successful
+ * completion is indicated by a new end point being created and passed back
+ * to the caller for future reference.
+ *
+ * Upon successful completion a zero will be returned and the peer information
+ * will be filled in.
+ *
+ * If the end point is not in the listening state -EINVAL will be returned.
+ *
+ * If during the connection sequence resource allocation fails the -ENOMEM
+ * will be returned.
+ *
+ * If the function is called asynchronously and no connection requests are
+ * pending it will return -EAGAIN.
+ *
+ * If the remote side is not sending any connection requests the caller may
+ * terminate this function with a signal. If so a -EINTR will be returned.
+ */
+int
+__scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
+{
+ struct endpt *lep = (struct endpt *)epd;
+ struct endpt *cep;
+ struct conreq *conreq;
+ struct nodemsg msg;
+ unsigned long sflags;
+ int err;
+
+ pr_debug("SCIFAPI accept: ep %p %s\n", lep, scif_ep_states[lep->state]);
+
+ // Error if flags other than SCIF_ACCEPT_SYNC are set
+ if (flags & ~SCIF_ACCEPT_SYNC) {
+ pr_debug("SCIFAPI accept: ep %p invalid flags %x\n", lep, flags & ~SCIF_ACCEPT_SYNC);
+ return -EINVAL;
+ }
+
+ if (!peer || !newepd) {
+ pr_debug("SCIFAPI accept: ep %p peer %p or newepd %p NULL\n",
+ lep, peer, newepd);
+ return -EINVAL;
+ }
+
+ might_sleep();
+ spin_lock_irqsave(&lep->lock, sflags);
+ if (lep->state != SCIFEP_LISTENING) {
+ pr_debug("SCIFAPI accept: ep %p not listending\n", lep);
+ spin_unlock_irqrestore(&lep->lock, sflags);
+ return -EINVAL;
+ }
+
+ if (!lep->conreqcnt && !(flags & SCIF_ACCEPT_SYNC)) {
+ // No connection request present and we do not want to wait
+ pr_debug("SCIFAPI accept: ep %p async request with nothing pending\n", lep);
+ spin_unlock_irqrestore(&lep->lock, sflags);
+ return -EAGAIN;
+ }
+
+ // NOTE: retry_connection is always entered with lep->lock held;
+ // it is dropped immediately below before sleeping.
+retry_connection:
+ spin_unlock_irqrestore(&lep->lock, sflags);
+ // Remember the accepting task's file table -- presumably used later
+ // when installing fds for registered accepts; confirm against users
+ // of lep->files.
+ lep->files = current ? current->files : NULL;
+ if ((err = wait_event_interruptible(lep->conwq,
+ (lep->conreqcnt || (lep->state != SCIFEP_LISTENING)))) != 0) {
+ // wait was interrupted
+ pr_debug("SCIFAPI accept: ep %p ^C detected\n", lep);
+ return err; // -ERESTARTSYS
+ }
+
+ if (lep->state != SCIFEP_LISTENING) {
+ return -EINTR;
+ }
+
+ spin_lock_irqsave(&lep->lock, sflags);
+
+ if (!lep->conreqcnt) {
+ // Another accepter consumed the request; go back to sleep.
+ goto retry_connection;
+ }
+
+ // Get the first connect request off the list
+ conreq = list_first_entry(&lep->conlist, struct conreq, list);
+ list_del(&conreq->list);
+ lep->conreqcnt--;
+ spin_unlock_irqrestore(&lep->lock, sflags);
+
+ // Fill in the peer information
+ peer->node = conreq->msg.src.node;
+ peer->port = conreq->msg.src.port;
+
+ // Create the connection endpoint
+ cep = (struct endpt *)kzalloc(sizeof(struct endpt), GFP_KERNEL);
+ if (!cep) {
+ pr_debug("SCIFAPI accept: ep %p new end point allocation failed\n", lep);
+ err = -ENOMEM;
+ goto scif_accept_error_epalloc;
+ }
+ spin_lock_init(&cep->lock);
+ mutex_init (&cep->sendlock);
+ mutex_init (&cep->recvlock);
+ cep->state = SCIFEP_CONNECTING;
+ cep->remote_dev = &scif_dev[peer->node];
+ cep->remote_ep = conreq->msg.payload[0];
+ cep->sd_state = SCIFDEV_RUNNING;
+
+ if (!scifdev_alive(cep)) {
+ err = -ENODEV;
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ goto scif_accept_error_qpalloc;
+ }
+
+ if (micscif_rma_ep_init(cep) < 0) {
+ pr_debug("SCIFAPI accept: ep %p new %p RMA EP init failed\n", lep, cep);
+ err = -ENOMEM;
+ goto scif_accept_error_qpalloc;
+ }
+
+ if ((err = micscif_reserve_dma_chan(cep))) {
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ goto scif_accept_error_qpalloc;
+ }
+
+ cep->qp_info.qp = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_KERNEL);
+ if (!cep->qp_info.qp) {
+ printk(KERN_ERR "Port Qp Allocation Failed\n");
+ err = -ENOMEM;
+ goto scif_accept_error_qpalloc;
+ }
+
+ cep->qp_info.qp->magic = SCIFEP_MAGIC;
+ cep->qp_info.qp->ep = (uint64_t)cep;
+ micscif_inc_node_refcnt(cep->remote_dev, 1);
+ // Map the connector's half of the queue pair (offset arrived in the
+ // connect request payload).
+ err = micscif_setup_qp_accept(cep->qp_info.qp, &cep->qp_info.qp_offset,
+ conreq->msg.payload[1], ENDPT_QP_SIZE, cep->remote_dev);
+ if (err) {
+ pr_debug("SCIFAPI accept: ep %p new %p micscif_setup_qp_accept %d qp_offset 0x%llx\n",
+ lep, cep, err, cep->qp_info.qp_offset);
+ micscif_dec_node_refcnt(cep->remote_dev, 1);
+ goto scif_accept_error_map;
+ }
+
+ cep->port.node = lep->port.node;
+ cep->port.port = lep->port.port;
+ cep->peer.node = peer->node;
+ cep->peer.port = peer->port;
+ cep->accepted_ep = true;
+ init_waitqueue_head(&cep->sendwq); // Wait for data to be consumed
+ init_waitqueue_head(&cep->recvwq); // Wait for data to be produced
+ init_waitqueue_head(&cep->conwq); // Wait for connection request
+
+ // Return the grant message
+ msg.uop = SCIF_CNCT_GNT;
+ msg.src = cep->port;
+ msg.payload[0] = cep->remote_ep;
+ msg.payload[1] = cep->qp_info.qp_offset;
+ msg.payload[2] = (uint64_t)cep;
+
+ err = micscif_nodeqp_send(cep->remote_dev, &msg, cep);
+
+ micscif_dec_node_refcnt(cep->remote_dev, 1);
+ if (err)
+ goto scif_accept_error_map;
+retry:
+ // Wait for the connector's GNTACK/GNTNACK, retrying the timeout as
+ // long as the peer node stays alive.
+ err = wait_event_timeout(cep->conwq,
+ (cep->state != SCIFEP_CONNECTING), NODE_ACCEPT_TIMEOUT);
+ if (!err && scifdev_alive(cep))
+ goto retry;
+
+ if (!err) {
+ err = -ENODEV;
+ goto scif_accept_error_map;
+ }
+
+ if (err > 0)
+ err = 0; // positive wait_event_timeout values mean success
+
+ kfree(conreq);
+
+ spin_lock_irqsave(&cep->lock, sflags);
+
+ if (cep->state == SCIFEP_CONNECTED) {
+ // Connect sequence complete return new endpoint information
+ *newepd = (scif_epd_t)cep;
+ spin_unlock_irqrestore(&cep->lock, sflags);
+ pr_debug("SCIFAPI accept: ep %p new %p returning new epnd point\n", lep, cep);
+ return 0;
+ }
+
+ if (cep->state == SCIFEP_CLOSING) {
+ // Remote failed to allocate resources and NAKed the grant.
+ // There is at this point nothing referencing the new end point.
+ spin_unlock_irqrestore(&cep->lock, sflags);
+ micscif_teardown_ep((void *)cep);
+ kfree(cep);
+
+ // If call with sync flag then go back and wait.
+ if (flags & SCIF_ACCEPT_SYNC) {
+ spin_lock_irqsave(&lep->lock, sflags);
+ goto retry_connection;
+ }
+
+ pr_debug("SCIFAPI accept: ep %p new %p remote failed to allocate resources\n", lep, cep);
+ return -EAGAIN;
+ }
+
+ // While connect was in progress the other side closed and sent a disconnect
+ // so set the end point status to closed but return anyway. This will allow
+ // the caller to drain anything the other side may have put in the message queue.
+ *newepd = (scif_epd_t)cep;
+ spin_unlock_irqrestore(&cep->lock, sflags);
+ return 0;
+
+ // Error allocating or mapping resources
+scif_accept_error_map:
+ kfree(cep->qp_info.qp);
+
+scif_accept_error_qpalloc:
+ kfree(cep);
+
+scif_accept_error_epalloc:
+ micscif_inc_node_refcnt(&scif_dev[conreq->msg.src.node], 1);
+ // New reject the connection request due to lack of resources
+ msg.uop = SCIF_CNCT_REJ;
+ msg.dst.node = conreq->msg.src.node;
+ msg.dst.port = conreq->msg.src.port;
+ msg.payload[0] = conreq->msg.payload[0];
+ msg.payload[1] = conreq->msg.payload[1];
+ /* No error handling for Notification messages */
+ micscif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg, NULL);
+ micscif_dec_node_refcnt(&scif_dev[conreq->msg.src.node], 1);
+
+ kfree(conreq);
+ return err;
+}
+
+/*
+ * scif_accept() - exported wrapper around __scif_accept().
+ *
+ * On success the newly created endpoint's reference count is
+ * initialized; get/put_kref_count() bracket the call on the listening
+ * endpoint (presumably a kref pin -- verify against their definitions).
+ */
+int
+scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t *newepd, int flags)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_accept(epd, peer, newepd, flags);
+ if (ret == 0) {
+ // Fresh endpoint starts with a reference count of one
+ kref_init(&((*newepd)->ref_count));
+ }
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_accept);
+
+/*
+ * scif_msg_param_check:
+ * @epd: The end point address returned from scif_open() (unused here)
+ * @len: Requested transfer length; must be non-negative
+ * @flags: Zero, or a mask that includes the blocking flag
+ *
+ * Validate the common parameters of the messaging APIs
+ * scif_send()/scif_recv(). Returns 0 when acceptable, -EINVAL
+ * otherwise. SCIF_RECV_BLOCK is used for the flag check in both
+ * directions -- presumably it shares its value with the send-side
+ * blocking flag; verify against the flag definitions.
+ */
+static inline int
+scif_msg_param_check(scif_epd_t epd, int len, int flags)
+{
+ if (len < 0)
+ return -EINVAL;
+
+ /* Any non-zero flags must at least contain the blocking bit. */
+ if (flags && !(flags & SCIF_RECV_BLOCK))
+ return -EINVAL;
+
+ return 0;
+}
+
+#define SCIF_BLAST (1 << 1) /* Use bit 1 of flags field */
+
+#ifdef SCIF_BLAST
+/*
+ * Added a temporary implementation of the exception path.
+ * The cost to the normal path is 1 local variable (set once and
+ * tested once) plus 2 tests for the 'blast' flag.
+ * This only apply to the card side kernel API.
+ */
+#ifndef _MIC_SCIF_
+#undef SCIF_BLAST
+#endif
+#endif
+
+/**
+ * _scif_send() - Send data to connection queue
+ * @epd: The end point address returned from scif_open()
+ * @msg: Address of the data to send
+ * @len: Length to send
+ * @flags: Synchronous or asynchronous access
+ *
+ * This function sends a packet of data to the queue created by the
+ * connection establishment sequence. It returns when the packet has
+ * been completely sent.
+ *
+ * Successful completion returns the number of bytes sent.
+ *
+ * If the end point is not in the connect state returns -ENOTCONN;
+ *
+ * This function may be interrupted by a signal and will return -EINTR.
+ */
+int
+_scif_send(scif_epd_t epd, void *msg, int len, int flags)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ struct nodemsg notif_msg;
+ unsigned long sflags;
+ size_t curr_xfer_len = 0;
+ size_t sent_len = 0;
+ size_t write_count;
+ int ret;
+#ifdef SCIF_BLAST
+ int tl; // non-zero iff ep->lock is actually held (BLAST trylock)
+#endif
+
+ if (flags & SCIF_SEND_BLOCK)
+ might_sleep();
+
+#ifdef SCIF_BLAST
+ if (flags & SCIF_BLAST) {
+ /*
+ * Do a decent try to acquire lock (~100 uSec)
+ */
+ for (ret = tl = 0; ret < 100 && !tl; ret++) {
+ tl = spin_trylock_irqsave(&ep->lock, sflags);
+ cpu_relax();
+ }
+ /*
+ * NOTE(review): if all 100 attempts fail, tl == 0 and the
+ * code below runs WITHOUT holding ep->lock -- apparently an
+ * accepted risk of the BLAST bypass path; verify.
+ */
+ } else {
+ tl = 1;
+ spin_lock_irqsave(&ep->lock, sflags);
+ }
+#else
+ spin_lock_irqsave(&ep->lock, sflags);
+#endif
+
+ while (sent_len != len) {
+ if (ep->state == SCIFEP_DISCONNECTED) {
+ ret = (int)(sent_len ? sent_len : -ECONNRESET);
+ goto unlock_dec_return;
+ }
+ if (ep->state != SCIFEP_CONNECTED) {
+ ret = (int)(sent_len ? sent_len : -ENOTCONN);
+ goto unlock_dec_return;
+ }
+ if (!scifdev_alive(ep)) {
+ ret = (int) (sent_len ? sent_len : -ENODEV);
+ goto unlock_dec_return;
+ }
+ write_count = micscif_rb_space(&ep->qp_info.qp->outbound_q);
+ if (write_count) {
+ /*
+ * Best effort to send as much data as there
+ * is space in the RB particularly important for the
+ * Non Blocking case.
+ */
+ curr_xfer_len = min(len - sent_len, write_count);
+ ret = micscif_rb_write(&ep->qp_info.qp->outbound_q, msg,
+ (uint32_t)curr_xfer_len);
+ if (ret < 0) {
+ ret = -EFAULT;
+ goto unlock_dec_return;
+ }
+ if (ret) {
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ /*
+ * If there is space in the RB and we have the
+ * EP lock held then writing to the RB should
+ * succeed. Releasing spin lock before asserting
+ * to avoid deadlocking the system.
+ */
+ BUG_ON(ret);
+ }
+ /*
+ * Success. Update write pointer.
+ */
+ micscif_rb_commit(&ep->qp_info.qp->outbound_q);
+#ifdef SCIF_BLAST
+ if (flags & SCIF_BLAST) {
+ /*
+ * Bypass-path; set flag int the host side node_qp
+ * and ring the doorbell. Host will wake-up all
+ * listeners, such that the message will be seen.
+ * Need micscif_send_host_intr() to be non-static.
+ */
+ extern int micscif_send_host_intr(struct micscif_dev *, uint32_t);
+ ep->remote_dev->qpairs->remote_qp->blast = 1;
+ smp_wmb(); /* Sufficient or need sfence? */
+ micscif_send_host_intr(ep->remote_dev, 0);
+ } else {
+ /*
+ * Normal path: send notification on the
+ * node_qp ring buffer and ring the doorbell.
+ */
+ notif_msg.src = ep->port;
+ notif_msg.uop = SCIF_CLIENT_SENT;
+ notif_msg.payload[0] = ep->remote_ep;
+ if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
+ ret = sent_len ? sent_len : ret;
+ goto unlock_dec_return;
+ }
+ }
+#else
+ /*
+ * Send a notification to the peer about the
+ * produced data message.
+ */
+ notif_msg.src = ep->port;
+ notif_msg.uop = SCIF_CLIENT_SENT;
+ notif_msg.payload[0] = ep->remote_ep;
+ if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
+ ret = (int)(sent_len ? sent_len : ret);
+ goto unlock_dec_return;
+ }
+#endif
+ sent_len += curr_xfer_len;
+ msg = (char *)msg + curr_xfer_len;
+ continue;
+ }
+ curr_xfer_len = min(len - sent_len, (size_t)(ENDPT_QP_SIZE - 1));
+ /*
+ * Not enough space in the RB. Return in the Non Blocking case.
+ */
+ if (!(flags & SCIF_SEND_BLOCK)) {
+ ret = (int)sent_len;
+ goto unlock_dec_return;
+ }
+#ifdef SCIF_BLAST
+ /*
+ * Flags SCIF_BLAST and SCIF_SEND_BLOCK are mutually
+ * exclusive, so if we get here we know that SCIF_BLAST
+ * was not set and thus we _do_ have the spinlock.
+ * No need to check variable tl here
+ */
+#endif
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ /*
+ * Wait for a message now in the Blocking case.
+ */
+ if ((ret = wait_event_interruptible(ep->sendwq,
+ (SCIFEP_CONNECTED != ep->state) ||
+ (micscif_rb_space(&ep->qp_info.qp->outbound_q)
+ >= curr_xfer_len) || (!scifdev_alive(ep))))) {
+ ret = (int) (sent_len ? sent_len : ret);
+ goto dec_return;
+ }
+ spin_lock_irqsave(&ep->lock, sflags);
+ }
+ ret = len;
+unlock_dec_return:
+#ifdef SCIF_BLAST
+ if (tl)
+#endif
+ spin_unlock_irqrestore(&ep->lock, sflags);
+dec_return:
+ return ret;
+}
+
+/**
+ * _scif_recv() - Receive data from connection queue
+ * @epd: The end point address returned from scif_open()
+ * @msg: Address to place data
+ * @len: Length to receive
+ * @flags: Synchronous or asynchronous access
+ *
+ * This function requests to receive a packet of data from the queue
+ * created by the connection establishment sequence. It reads the amount
+ * of data requested before returning.
+ *
+ * This function differs from scif_send() by also returning data if the
+ * end point is in the disconnected state and data is present.
+ *
+ * Successful completion returns the number of bytes read.
+ *
+ * If the end point is not in the connect state or in the disconnected state
+ * with data present it returns -ENOTCONN;
+ *
+ * This function may be interrupted by a signal and will return -EINTR.
+ */
+int
+_scif_recv(scif_epd_t epd, void *msg, int len, int flags)
+{
+ int read_size;
+ struct endpt *ep = (struct endpt *)epd;
+ unsigned long sflags;
+ struct nodemsg notif_msg;
+ size_t curr_recv_len = 0;
+ size_t remaining_len = len;
+ size_t read_count;
+ int ret;
+
+ if (flags & SCIF_RECV_BLOCK)
+ might_sleep();
+
+ // Keep the remote node awake while we touch the queue pair
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+ spin_lock_irqsave(&ep->lock, sflags);
+ while (remaining_len) {
+ if (ep->state != SCIFEP_CONNECTED &&
+ ep->state != SCIFEP_DISCONNECTED) {
+ ret = (int) (len - remaining_len) ?
+ (int) (len - remaining_len) : -ENOTCONN;
+ goto unlock_dec_return;
+ }
+ read_count = micscif_rb_count(&ep->qp_info.qp->inbound_q,
+ (int) remaining_len);
+ if (read_count) {
+ /*
+ * Best effort to recv as much data as there
+ * are bytes to read in the RB particularly
+ * important for the Non Blocking case.
+ */
+ curr_recv_len = min(remaining_len, read_count);
+ read_size = micscif_rb_get_next(
+ &ep->qp_info.qp->inbound_q,
+ msg, (int) curr_recv_len);
+ if (read_size < 0){
+ /* only could happen when copy to USER buffer
+ */
+ ret = -EFAULT;
+ goto unlock_dec_return;
+ }
+ if (read_size != curr_recv_len) {
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ /*
+ * If there are bytes to be read from the RB and
+ * we have the EP lock held then reading from
+ * RB should succeed. Releasing spin lock before
+ * asserting to avoid deadlocking the system.
+ */
+ BUG_ON(read_size != curr_recv_len);
+ }
+ if (ep->state == SCIFEP_CONNECTED) {
+ /*
+ * Update the read pointer only if the endpoint is
+ * still connected else the read pointer might no
+ * longer exist since the peer has freed resources!
+ */
+ micscif_rb_update_read_ptr(&ep->qp_info.qp->inbound_q);
+ /*
+ * Send a notification to the peer about the
+ * consumed data message only if the EP is in
+ * SCIFEP_CONNECTED state.
+ */
+ notif_msg.src = ep->port;
+ notif_msg.uop = SCIF_CLIENT_RCVD;
+ notif_msg.payload[0] = ep->remote_ep;
+ if ((ret = micscif_nodeqp_send(ep->remote_dev, &notif_msg, ep))) {
+ ret = (len - (int)remaining_len) ?
+ (len - (int)remaining_len) : ret;
+ goto unlock_dec_return;
+ }
+ }
+ remaining_len -= curr_recv_len;
+ msg = (char *)msg + curr_recv_len;
+ continue;
+ }
+ curr_recv_len = min(remaining_len, (size_t)(ENDPT_QP_SIZE - 1));
+ /*
+ * Bail out now if the EP is in SCIFEP_DISCONNECTED state else
+ * we will keep looping forever.
+ */
+ if (ep->state == SCIFEP_DISCONNECTED) {
+ ret = (len - (int)remaining_len) ?
+ (len - (int)remaining_len) : -ECONNRESET;
+ goto unlock_dec_return;
+ }
+ /*
+ * Return in the Non Blocking case if there is no data
+ * to read in this iteration.
+ */
+ if (!(flags & SCIF_RECV_BLOCK)) {
+ ret = len - (int)remaining_len;
+ goto unlock_dec_return;
+ }
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ // Drop the node reference while sleeping; reacquired below.
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ /*
+ * Wait for a message now in the Blocking case.
+ * or until other side disconnects.
+ */
+ if ((ret = wait_event_interruptible(ep->recvwq,
+ (SCIFEP_CONNECTED != ep->state) ||
+ (micscif_rb_count(&ep->qp_info.qp->inbound_q,
+ curr_recv_len) >= curr_recv_len) || (!scifdev_alive(ep))))) {
+ ret = (len - remaining_len) ?
+ (len - (int)remaining_len) : ret;
+ goto dec_return;
+ }
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+ spin_lock_irqsave(&ep->lock, sflags);
+ }
+ ret = len;
+unlock_dec_return:
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+dec_return:
+ return ret;
+}
+
+
+/**
+ * scif_user_send() - Send data to connection queue
+ * @epd: The end point address returned from scif_open()
+ * @msg: Address to place data
+ * @len: Length to receive
+ * @flags: Syncronous or asynchronous access
+ *
+ * This function is called from the driver IOCTL entry point
+ * only and is a wrapper for _scif_send().
+ */
+int
+scif_user_send(scif_epd_t epd, void *msg, int len, int flags)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ int err = 0;
+ int sent_len = 0;
+ char *tmp;
+ int loop_len;
+ int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));;
+ pr_debug("SCIFAPI send (U): ep %p %s\n", ep, scif_ep_states[ep->state]);
+
+ if (!len)
+ return 0;
+
+ if ((err = scif_msg_param_check(epd, len, flags)))
+ goto send_err;
+
+ if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {
+ err = -ENOMEM;
+ goto send_err;
+ }
+ err = 0;
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+ /*
+ * Grabbing the lock before breaking up the transfer in
+ * multiple chunks is required to ensure that messages do
+ * not get fragmented and reordered.
+ */
+ mutex_lock(&ep->sendlock);
+
+ while (sent_len != len) {
+ msg = (void *)((char *)msg + err);
+ loop_len = len - sent_len;
+ loop_len = min(chunk_len, loop_len);
+ if (copy_from_user(tmp, msg, loop_len)) {
+ err = -EFAULT;
+ goto send_free_err;
+ }
+ err = _scif_send(epd, (void *)tmp, loop_len, flags);
+ if (err < 0) {
+ goto send_free_err;
+ }
+ sent_len += err;
+ if (err !=loop_len) {
+ goto send_free_err;
+ }
+ }
+send_free_err:
+ mutex_unlock(&ep->sendlock);
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ kfree(tmp);
+send_err:
+ return err < 0 ? err : sent_len;
+}
+
+/**
+ * scif_user_recv() - Recieve data from connection queue
+ * @epd: The end point address returned from scif_open()
+ * @msg: Address to place data
+ * @len: Length to receive
+ * @flags: Syncronous or asynchronous access
+ *
+ * This function is called from the driver IOCTL entry point
+ * only and is a wrapper for _scif_recv().
+ */
+int
+scif_user_recv(scif_epd_t epd, void *msg, int len, int flags)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ int err = 0;
+ int recv_len = 0;
+ char *tmp;
+ int loop_len;
+ int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1)));;
+ pr_debug("SCIFAPI recv (U): ep %p %s\n", ep, scif_ep_states[ep->state]);
+
+ if (!len)
+ return 0;
+
+ if ((err = scif_msg_param_check(epd, len, flags)))
+ goto recv_err;
+
+ if (!(tmp = kmalloc(chunk_len, GFP_KERNEL))) {
+ err = -ENOMEM;
+ goto recv_err;
+ }
+ err = 0;
+ /*
+ * Grabbing the lock before breaking up the transfer in
+ * multiple chunks is required to ensure that messages do
+ * not get fragmented and reordered.
+ */
+ mutex_lock(&ep->recvlock);
+
+ while (recv_len != len) {
+ msg = (void *)((char *)msg + err);
+ loop_len = len - recv_len;
+ loop_len = min(chunk_len, loop_len);
+ if ((err = _scif_recv(epd, tmp, loop_len, flags)) < 0)
+ goto recv_free_err;
+ if (copy_to_user(msg, tmp, err)) {
+ err = -EFAULT;
+ goto recv_free_err;
+ }
+ recv_len += err;
+ if (err !=loop_len) {
+ goto recv_free_err;
+ }
+ }
+recv_free_err:
+ mutex_unlock(&ep->recvlock);
+ kfree(tmp);
+recv_err:
+ return err < 0 ? err : recv_len;
+}
+
+#ifdef SCIF_BLAST
+/*
+ * Added a temporary implementation of the exception path.
+ * The cost to the normal path testing of 2 flag bits instead
+ * of just one and a change to condition for node-wakeup.
+ */
+#endif
+
+/**
+ * scif_send() - Send data to connection queue
+ * @epd: The end point address returned from scif_open()
+ * @msg: Address to place data
+ * @len: Length to receive
+ * @flags: Syncronous or asynchronous access
+ *
+ * This function is called from the kernel mode only and is
+ * a wrapper for _scif_send().
+ */
+int
+__scif_send(scif_epd_t epd, void *msg, int len, int flags)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ int ret;
+
+ pr_debug("SCIFAPI send (K): ep %p %s\n", ep, scif_ep_states[ep->state]);
+ if (!len)
+ return 0;
+
+#ifdef SCIF_BLAST
+ /*
+ * KAA: this is same code as scif_msg_param_check(),
+ * but since that routine is shared with scif_recv
+ * I thought is safer to replicate code here.
+ */
+ if (len < 0)
+ return -EINVAL;
+
+ if (flags && !(flags & (SCIF_SEND_BLOCK | SCIF_BLAST)))
+ return -EINVAL;
+
+ if ((flags & (SCIF_SEND_BLOCK | SCIF_BLAST)) ==
+ (SCIF_SEND_BLOCK | SCIF_BLAST))
+ return -EINVAL;
+#else
+ if ((ret = scif_msg_param_check(epd, len, flags)))
+ return ret;
+#endif
+ /*
+ * Cannot block while waiting for node to wake up
+ * if non blocking messaging mode is requested. Return
+ * ENODEV if the remote node is idle.
+ */
+ if (!(flags & SCIF_SEND_BLOCK) && ep->remote_dev &&
+ SCIF_NODE_IDLE == atomic_long_read(
+ &ep->remote_dev->scif_ref_cnt))
+ return -ENODEV;
+
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+
+ /*
+ * Grab the mutex lock in the blocking case only
+ * to ensure messages do not get fragmented/reordered.
+ * The non blocking mode is protected using spin locks
+ * in _scif_send().
+ */
+ if (flags & SCIF_SEND_BLOCK)
+ mutex_lock(&ep->sendlock);
+
+ ret = _scif_send(epd, msg, len, flags);
+
+ if (flags & SCIF_SEND_BLOCK)
+ mutex_unlock(&ep->sendlock);
+
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ return ret;
+}
+
/*
 * scif_send() - exported kernel-mode send entry point. Holds an
 * endpoint kref across __scif_send() so the endpoint cannot be
 * released mid-call.
 */
int
scif_send(scif_epd_t epd, void *msg, int len, int flags)
{
	int ret;
	/* Pin the endpoint for the duration of the call */
	get_kref_count(epd);
	ret = __scif_send(epd, msg, len, flags);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_send);
+
+/**
+ * scif_recv() - Recieve data from connection queue
+ * @epd: The end point address returned from scif_open()
+ * @msg: Address to place data
+ * @len: Length to receive
+ * @flags: Syncronous or asynchronous access
+ *
+ * This function is called from the kernel mode only and is
+ * a wrapper for _scif_recv().
+ */
+int
+__scif_recv(scif_epd_t epd, void *msg, int len, int flags)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ int ret;
+
+ pr_debug("SCIFAPI recv (K): ep %p %s\n", ep, scif_ep_states[ep->state]);
+
+ if (!len)
+ return 0;
+
+ if ((ret = scif_msg_param_check(epd, len, flags)))
+ return ret;
+
+ /*
+ * Cannot block while waiting for node to wake up
+ * if non blocking messaging mode is requested. Return
+ * ENODEV if the remote node is idle.
+ */
+ if (!flags && ep->remote_dev &&
+ SCIF_NODE_IDLE == atomic_long_read(
+ &ep->remote_dev->scif_ref_cnt))
+ return -ENODEV;
+
+ /*
+ * Grab the mutex lock in the blocking case only
+ * to ensure messages do not get fragmented/reordered.
+ * The non blocking mode is protected using spin locks
+ * in _scif_send().
+ */
+ if (flags & SCIF_RECV_BLOCK)
+ mutex_lock(&ep->recvlock);
+
+ ret = _scif_recv(epd, msg, len, flags);
+
+ if (flags & SCIF_RECV_BLOCK)
+ mutex_unlock(&ep->recvlock);
+
+ return ret;
+}
+
/*
 * scif_recv() - exported kernel-mode receive entry point. Holds an
 * endpoint kref across __scif_recv() so the endpoint cannot be
 * released mid-call.
 */
int
scif_recv(scif_epd_t epd, void *msg, int len, int flags)
{
	int ret;
	/* Pin the endpoint for the duration of the call */
	get_kref_count(epd);
	ret = __scif_recv(epd, msg, len, flags);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_recv);
+
+/**
+ * __scif_pin_pages - __scif_pin_pages() pins the physical pages which back
+ * the range of virtual address pages starting at addr and continuing for
+ * len bytes. addr and len are constrained to be multiples of the page size.
+ * A successful scif_register() call returns an opaque pointer value
+ * which may be used in subsequent calls to scif_register_pinned_pages().
+ *
+ * Return Values
+ * Upon successful completion, __scif_pin_pages() returns a
+ * scif_pinned_pages_t value else an apt error is returned as documented
+ * in scif.h. Protections of the set of pinned pages are also returned by
+ * reference via out_prot.
+ */
+int
+__scif_pin_pages(void *addr, size_t len, int *out_prot,
+ int map_flags, scif_pinned_pages_t *pages)
+{
+ struct scif_pinned_pages *pinned_pages;
+ int nr_pages, err = 0, i;
+ bool vmalloc_addr = false;
+ bool try_upgrade = false;
+ int prot = *out_prot;
+ int ulimit = 0;
+ struct mm_struct *mm = NULL;
+
+ /* Unsupported flags */
+ if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT))
+ return -EINVAL;
+ ulimit = !!(map_flags & SCIF_MAP_ULIMIT);
+
+ /* Unsupported protection requested */
+ if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
+ return -EINVAL;
+
+ /* addr/len must be page aligned. len should be non zero */
+ if ((!len) ||
+ (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
+ (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
+ return -EINVAL;
+
+ might_sleep();
+
+ nr_pages = (int)(len >> PAGE_SHIFT);
+
+ /* Allocate a set of pinned pages */
+ if (!(pinned_pages = micscif_create_pinned_pages(nr_pages, prot)))
+ return -ENOMEM;
+
+ if (unlikely(map_flags & SCIF_MAP_KERNEL)) {
+ if (is_vmalloc_addr(addr))
+ vmalloc_addr = true;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (unlikely(vmalloc_addr))
+ pinned_pages->pages[i] =
+ vmalloc_to_page((char *)addr + (i * PAGE_SIZE) );
+ else
+ pinned_pages->pages[i] =
+ virt_to_page((char *)addr + (i * PAGE_SIZE) );
+ pinned_pages->num_pages[i] = 1;
+ pinned_pages->nr_contig_chunks++;
+ }
+ pinned_pages->nr_pages = nr_pages;
+ pinned_pages->map_flags = SCIF_MAP_KERNEL;
+ } else {
+ if (prot == SCIF_PROT_READ)
+ try_upgrade = true;
+ prot |= SCIF_PROT_WRITE;
+retry:
+ mm = current->mm;
+ down_write(&mm->mmap_sem);
+ if (ulimit) {
+ err = __scif_check_inc_pinned_vm(mm, nr_pages);
+ if (err) {
+ up_write(&mm->mmap_sem);
+ pinned_pages->nr_pages = 0;
+ goto error_unmap;
+ }
+ }
+
+ pinned_pages->nr_pages = get_user_pages(
+ current,
+ mm,
+ (uint64_t)addr,
+ nr_pages,
+ !!(prot & SCIF_PROT_WRITE),
+ 0,
+ pinned_pages->pages,
+ pinned_pages->vma);
+ up_write(&mm->mmap_sem);
+ if (nr_pages == pinned_pages->nr_pages) {
+#ifdef RMA_DEBUG
+ atomic_long_add_return(nr_pages, &ms_info.rma_pin_cnt);
+#endif
+ micscif_detect_large_page(pinned_pages, addr);
+ } else {
+ if (try_upgrade) {
+ if (ulimit)
+ __scif_dec_pinned_vm_lock(mm, nr_pages, 0);
+#ifdef RMA_DEBUG
+ WARN_ON(atomic_long_sub_return(1,
+ &ms_info.rma_mm_cnt) < 0);
+#endif
+ /* Roll back any pinned pages */
+ for (i = 0; i < pinned_pages->nr_pages; i++) {
+ if (pinned_pages->pages[i])
+ page_cache_release(pinned_pages->pages[i]);
+ }
+ prot &= ~SCIF_PROT_WRITE;
+ try_upgrade = false;
+ goto retry;
+ }
+ }
+ pinned_pages->map_flags = 0;
+ }
+
+ if (pinned_pages->nr_pages < nr_pages) {
+ err = -EFAULT;
+ pinned_pages->nr_pages = nr_pages;
+ goto dec_pinned;
+ }
+
+ *out_prot = prot;
+ atomic_set(&pinned_pages->ref_count, nr_pages);
+ *pages = pinned_pages;
+ return err;
+dec_pinned:
+ if (ulimit)
+ __scif_dec_pinned_vm_lock(mm, nr_pages, 0);
+ /* Something went wrong! Rollback */
+error_unmap:
+ pinned_pages->nr_pages = nr_pages;
+ micscif_destroy_pinned_pages(pinned_pages);
+ *pages = NULL;
+ pr_debug("%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
+ return err;
+
+}
+
+/**
+ * scif_pin_pages - scif_pin_pages() pins the physical pages which back
+ * the range of virtual address pages starting at addr and continuing for
+ * len bytes. addr and len are constrained to be multiples of the page size.
+ * A successful scif_register() call returns an opaque pointer value
+ * which may be used in subsequent calls to scif_register_pinned_pages().
+ *
+ * Return Values
+ * Upon successful completion, scif_register() returns a
+ * scif_pinned_pages_t value else an apt error is returned as documented
+ * in scif.h
+ */
+int
+scif_pin_pages(void *addr, size_t len, int prot,
+ int map_flags, scif_pinned_pages_t *pages)
+{
+ return __scif_pin_pages(addr, len, &prot, map_flags, pages);
+}
+EXPORT_SYMBOL(scif_pin_pages);
+
+/**
+ * scif_unpin_pages: Unpin a set of pages
+ *
+ * Return Values:
+ * Upon successful completion, scif_unpin_pages() returns 0;
+ * else an apt error is returned as documented in scif.h
+ */
+int
+scif_unpin_pages(scif_pinned_pages_t pinned_pages)
+{
+ int err = 0, ret;
+
+ if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic)
+ return -EINVAL;
+
+ ret = atomic_sub_return((int32_t)pinned_pages->nr_pages,
+ &pinned_pages->ref_count);
+ BUG_ON(ret < 0);
+
+ /*
+ * Destroy the window if the ref count for this set of pinned
+ * pages has dropped to zero. If it is positive then there is
+ * a valid registered window which is backed by these pages and
+ * it will be destroyed once all such windows are unregistered.
+ */
+ if (!ret)
+ err = micscif_destroy_pinned_pages(pinned_pages);
+
+ return err;
+}
+EXPORT_SYMBOL(scif_unpin_pages);
+
+/**
+ * scif_register_pinned_pages: Mark a memory region for remote access.
+ *
+ * The scif_register_pinned_pages() function opens a window, a range
+ * of whole pages of the registered address space of the endpoint epd,
+ * starting at offset po. The value of po, further described below, is
+ * a function of the parameters offset and pinned_pages, and the value
+ * of map_flags. Each page of the window represents a corresponding
+ * physical memory page of pinned_pages; the length of the window is
+ * the same as the length of pinned_pages. A successful scif_register()
+ * call returns po as the return value.
+ *
+ * Return Values
+ * Upon successful completion, scif_register_pinned_pages() returns
+ * the offset at which the mapping was placed (po);
+ * else an apt error is returned as documented in scif.h
+ */
+off_t
+__scif_register_pinned_pages(scif_epd_t epd,
+ scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ uint64_t computed_offset;
+ struct reg_range_t *window;
+ int err;
+ size_t len;
+
+#ifdef DEBUG
+ /* Bad EP */
+ if (!ep || !pinned_pages || pinned_pages->magic != SCIFEP_MAGIC)
+ return -EINVAL;
+#endif
+ /* Unsupported flags */
+ if (map_flags & ~SCIF_MAP_FIXED)
+ return -EINVAL;
+
+ len = pinned_pages->nr_pages << PAGE_SHIFT;
+
+ /*
+ * Offset is not page aligned/negative or offset+len
+ * wraps around with SCIF_MAP_FIXED.
+ */
+ if ((map_flags & SCIF_MAP_FIXED) &&
+ ((align_low(offset, PAGE_SIZE) != offset) ||
+ (offset < 0) ||
+ (offset + (off_t)len < offset)))
+ return -EINVAL;
+
+ might_sleep();
+
+ if ((err = verify_epd(ep)))
+ return err;
+
+ /* Compute the offset for this registration */
+ if ((err = micscif_get_window_offset(ep, map_flags, offset,
+ len, &computed_offset)))
+ return err;
+
+ /* Allocate and prepare self registration window */
+ if (!(window = micscif_create_window(ep, pinned_pages->nr_pages,
+ computed_offset, false))) {
+ micscif_free_window_offset(ep, computed_offset, len);
+ return -ENOMEM;
+ }
+
+ window->pinned_pages = pinned_pages;
+ window->nr_pages = pinned_pages->nr_pages;
+ window->nr_contig_chunks = pinned_pages->nr_contig_chunks;
+ window->prot = pinned_pages->prot;
+
+ /*
+ * This set of pinned pages now belongs to this window as well.
+ * Assert if the ref count is zero since it is an error to
+ * pass pinned_pages to scif_register_pinned_pages() after
+ * calling scif_unpin_pages().
+ */
+ if (!atomic_add_unless(&pinned_pages->ref_count,
+ (int32_t)pinned_pages->nr_pages, 0))
+ BUG_ON(1);
+
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+
+ if ((err = micscif_send_alloc_request(ep, window))) {
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ goto error_unmap;
+ }
+
+ /* Prepare the remote registration window */
+ if ((err = micscif_prep_remote_window(ep, window))) {
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ micscif_set_nr_pages(ep->remote_dev, window);
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ goto error_unmap;
+ }
+
+ /* Tell the peer about the new window */
+ if ((err = micscif_send_scif_register(ep, window))) {
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ goto error_unmap;
+ }
+
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+
+ /* No further failures expected. Insert new window */
+ mutex_lock(&ep->rma_info.rma_lock);
+ set_window_ref_count(window, pinned_pages->nr_pages);
+ micscif_insert_window(window, &ep->rma_info.reg_list);
+ mutex_unlock(&ep->rma_info.rma_lock);
+
+ return computed_offset;
+error_unmap:
+ micscif_destroy_window(ep, window);
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ return err;
+}
+
/*
 * scif_register_pinned_pages() - exported wrapper that holds an
 * endpoint kref across __scif_register_pinned_pages().
 */
off_t
scif_register_pinned_pages(scif_epd_t epd,
	scif_pinned_pages_t pinned_pages, off_t offset, int map_flags)
{
	off_t ret;
	/* Pin the endpoint for the duration of the call */
	get_kref_count(epd);
	ret = __scif_register_pinned_pages(epd, pinned_pages, offset, map_flags);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_register_pinned_pages);
+
+/**
+ * scif_get_pages - Add references to remote registered pages
+ *
+ * scif_get_pages() returns the addresses of the physical pages represented
+ * by those pages of the registered address space of the peer of epd, starting
+ * at offset offset and continuing for len bytes. offset and len are constrained
+ * to be multiples of the page size.
+ *
+ * Return Values
+ * Upon successful completion, scif_get_pages() returns 0;
+ * else an apt error is returned as documented in scif.h.
+ */
+int
+__scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ struct micscif_rma_req req;
+ struct reg_range_t *window = NULL;
+ int nr_pages, err, i;
+
+ pr_debug("SCIFAPI get_pinned_pages: ep %p %s offset 0x%lx len 0x%lx\n",
+ ep, scif_ep_states[ep->state], offset, len);
+
+ if ((err = verify_epd(ep)))
+ return err;
+
+ if ((!len) ||
+ (offset < 0) ||
+ (offset + len < offset) ||
+ (align_low((uint64_t)offset, PAGE_SIZE) != (uint64_t)offset) ||
+ (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
+ return -EINVAL;
+
+ nr_pages = len >> PAGE_SHIFT;
+
+ req.out_window = &window;
+ req.offset = offset;
+ req.prot = 0;
+ req.nr_bytes = len;
+ req.type = WINDOW_SINGLE;
+ req.head = &ep->rma_info.remote_reg_list;
+
+ mutex_lock(&ep->rma_info.rma_lock);
+ /* Does a valid window exist? */
+ if ((err = micscif_query_window(&req))) {
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ goto error;
+ }
+ RMA_MAGIC(window);
+
+ /* Allocate scif_range */
+ if (!(*pages = kzalloc(sizeof(struct scif_range), GFP_KERNEL))) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ /* Allocate phys addr array */
+ if (!((*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)))) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+#ifndef _MIC_SCIF_
+ /* Allocate virtual address array */
+ if (!((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)))) {
+ err = -ENOMEM;
+ goto error;
+ }
+#endif
+ /* Populate the values */
+ (*pages)->cookie = window;
+ (*pages)->nr_pages = nr_pages;
+ (*pages)->prot_flags = window->prot;
+
+ for (i = 0; i < nr_pages; i++) {
+ (*pages)->phys_addr[i] =
+#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+ is_self_scifdev(ep->remote_dev) ?
+ micscif_get_dma_addr(window, offset + (i * PAGE_SIZE),
+ NULL, NULL, NULL) : window->phys_addr[i];
+#else
+ get_phys_addr(micscif_get_dma_addr(window, offset + (i * PAGE_SIZE),
+ NULL, NULL, NULL), ep->remote_dev);
+#endif
+#ifndef _MIC_SCIF_
+ if (!is_self_scifdev(ep->remote_dev))
+ (*pages)->va[i] =
+ get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.va +
+ (*pages)->phys_addr[i] -
+ get_per_dev_ctx(ep->remote_dev->sd_node - 1)->aper.pa;
+#endif
+ }
+
+ window->get_put_ref_count += nr_pages;
+ get_window_ref_count(window, nr_pages);
+error:
+ mutex_unlock(&ep->rma_info.rma_lock);
+ if (err) {
+ if (*pages) {
+ if ((*pages)->phys_addr)
+ scif_free((*pages)->phys_addr, nr_pages * sizeof(dma_addr_t));
+#ifndef _MIC_SCIF_
+ if ((*pages)->va)
+ scif_free((*pages)->va, nr_pages * sizeof(void *));
+#endif
+ kfree(*pages);
+ *pages = NULL;
+ }
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ } else {
+ micscif_create_node_dep(ep->remote_dev, nr_pages);
+ }
+ return err;
+}
+
/*
 * scif_get_pages() - exported wrapper that holds an endpoint kref
 * across __scif_get_pages().
 */
int
scif_get_pages(scif_epd_t epd, off_t offset, size_t len, struct scif_range **pages)
{
	int ret;
	/* Pin the endpoint for the duration of the call */
	get_kref_count(epd);
	ret = __scif_get_pages(epd, offset, len, pages);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_get_pages);
+
+/**
+ * scif_put_pages - Remove references from remote registered pages
+ *
+ * scif_put_pages() returns a scif_range structure previously obtained by
+ * calling scif_get_pages(). When control returns, the physical pages may
+ * become available for reuse if and when the window which represented
+ * those pages is unregistered. Therefore, those pages must never be accessed.
+ *
+ * Return Values
+ * Upon success, zero is returned.
+ * else an apt error is returned as documented in scif.h.
+ */
+int
+__scif_put_pages(struct scif_range *pages)
+{
+ struct endpt *ep;
+ struct reg_range_t *window;
+ struct nodemsg msg;
+
+ if (!pages || !pages->cookie)
+ return -EINVAL;
+
+ window = pages->cookie;
+
+ if (!window || window->magic != SCIFEP_MAGIC ||
+ !window->get_put_ref_count)
+ return -EINVAL;
+
+ ep = (struct endpt *)window->ep;
+
+ /*
+ * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
+ * callee should be allowed to release references to the pages,
+ * else the endpoint was not connected in the first place,
+ * hence the ENOTCONN.
+ */
+ if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
+ return -ENOTCONN;
+
+ /*
+ * TODO: Re-enable this check once ref counts for kernel mode APIs
+ * have been implemented and node remove call backs are called before
+ * the node is removed. This check results in kernel mode APIs not
+ * being able to release pages correctly since node remove callbacks
+ * are called after the node is removed currently.
+ * if (!scifdev_alive(ep))
+ * return -ENODEV;
+ */
+
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+ mutex_lock(&ep->rma_info.rma_lock);
+
+ /* Decrement the ref counts and check for errors */
+ window->get_put_ref_count -= pages->nr_pages;
+ BUG_ON(window->get_put_ref_count < 0);
+ put_window_ref_count(window, pages->nr_pages);
+
+ /* Initiate window destruction if ref count is zero */
+ if (!window->ref_count) {
+ drain_dma_intr(ep->rma_info.dma_chan);
+ /* Inform the peer about this window being destroyed. */
+ msg.uop = SCIF_MUNMAP;
+ msg.src = ep->port;
+ msg.payload[0] = window->peer_window;
+ /* No error handling for notification messages */
+ micscif_nodeqp_send(ep->remote_dev, &msg, ep);
+ list_del(&window->list_member);
+ /* Destroy this window from the peer's registered AS */
+ micscif_destroy_remote_window(ep, window);
+ }
+ mutex_unlock(&ep->rma_info.rma_lock);
+
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ micscif_destroy_node_dep(ep->remote_dev, pages->nr_pages);
+ scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
+#ifndef _MIC_SCIF_
+ scif_free(pages->va, pages->nr_pages * sizeof(void*));
+#endif
+ kfree(pages);
+ return 0;
+}
+
+int
+scif_put_pages(struct scif_range *pages)
+{
+ int ret;
+ struct reg_range_t *window = pages->cookie;
+ struct endpt *ep = (struct endpt *)window->ep;
+ if (atomic_read(&(&(ep->ref_count))->refcount) > 0) {
+ kref_get(&(ep->ref_count));
+ } else {
+ WARN_ON(1);
+ }
+ ret = __scif_put_pages(pages);
+ if (atomic_read(&(&(ep->ref_count))->refcount) > 0) {
+ kref_put(&(ep->ref_count), scif_ref_rel);
+ } else {
+ //WARN_ON(1);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(scif_put_pages);
+
+int scif_event_register(scif_callback_t handler)
+{
+ /* Add to the list of event handlers */
+ struct scif_callback *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+ if (!cb)
+ return -ENOMEM;
+ mutex_lock(&ms_info.mi_event_cblock);
+ cb->callback_handler = handler;
+ list_add_tail(&cb->list_member, &ms_info.mi_event_cb);
+ mutex_unlock(&ms_info.mi_event_cblock);
+ return 0;
+}
+EXPORT_SYMBOL(scif_event_register);
+
+int scif_event_unregister(scif_callback_t handler)
+{
+ struct list_head *pos, *unused;
+ struct scif_callback *temp;
+ int err = -EINVAL;
+
+ mutex_lock(&ms_info.mi_event_cblock);
+ list_for_each_safe(pos, unused, &ms_info.mi_event_cb) {
+ temp = list_entry(pos, struct scif_callback, list_member);
+ if (temp->callback_handler == handler) {
+ err = 0;
+ list_del(pos);
+ kfree(temp);
+ break;
+ }
+ }
+
+ mutex_unlock(&ms_info.mi_event_cblock);
+ return err;
+}
+EXPORT_SYMBOL(scif_event_unregister);
+
+/**
+ * scif_register - Mark a memory region for remote access.
+ * @epd: endpoint descriptor
+ * @addr: starting virtual address
+ * @len: length of range
+ * @offset: offset of window
+ * @prot: read/write protection
+ * @map_flags: flags
+ *
+ * Return Values
+ * Upon successful completion, scif_register() returns the offset
+ * at which the mapping was placed else an apt error is returned
+ * as documented in scif.h.
+ */
+off_t
+__scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
+ int prot, int map_flags)
+{
+ scif_pinned_pages_t pinned_pages;
+ off_t err;
+ struct endpt *ep = (struct endpt *)epd;
+ uint64_t computed_offset;
+ struct reg_range_t *window;
+ struct mm_struct *mm = NULL;
+
+ pr_debug("SCIFAPI register: ep %p %s addr %p len 0x%lx"
+ " offset 0x%lx prot 0x%x map_flags 0x%x\n",
+ epd, scif_ep_states[epd->state], addr, len, offset, prot, map_flags);
+
+ /* Unsupported flags */
+ if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL))
+ return -EINVAL;
+
+ /* Unsupported protection requested */
+ if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
+ return -EINVAL;
+
+ /* addr/len must be page aligned. len should be non zero */
+ if ((!len) ||
+ (align_low((uint64_t)addr, PAGE_SIZE) != (uint64_t)addr) ||
+ (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
+ return -EINVAL;
+
+ /*
+ * Offset is not page aligned/negative or offset+len
+ * wraps around with SCIF_MAP_FIXED.
+ */
+ if ((map_flags & SCIF_MAP_FIXED) &&
+ ((align_low(offset, PAGE_SIZE) != offset) ||
+ (offset < 0) ||
+ (offset + (off_t)len < offset)))
+ return -EINVAL;
+
+
+ might_sleep();
+
+#ifdef DEBUG
+ /* Bad EP */
+ if (!ep)
+ return -EINVAL;
+#endif
+
+ if ((err = verify_epd(ep)))
+ return err;
+
+ /* Compute the offset for this registration */
+ if ((err = micscif_get_window_offset(ep, map_flags, offset,
+ len, &computed_offset)))
+ return err;
+
+ /* Allocate and prepare self registration window */
+ if (!(window = micscif_create_window(ep, len >> PAGE_SHIFT,
+ computed_offset, false))) {
+ micscif_free_window_offset(ep, computed_offset, len);
+ return -ENOMEM;
+ }
+
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+
+ window->nr_pages = len >> PAGE_SHIFT;
+
+ if ((err = micscif_send_alloc_request(ep, window))) {
+ micscif_destroy_incomplete_window(ep, window);
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ return err;
+ }
+
+ if (!(map_flags & SCIF_MAP_KERNEL)) {
+ mm = __scif_acquire_mm();
+ map_flags |= SCIF_MAP_ULIMIT;
+ }
+ /* Pin down the pages */
+ if ((err = scif_pin_pages(addr, len, prot,
+ map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT),
+ &pinned_pages))) {
+ micscif_destroy_incomplete_window(ep, window);
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ __scif_release_mm(mm);
+ goto error;
+ }
+
+ window->pinned_pages = pinned_pages;
+ window->nr_contig_chunks = pinned_pages->nr_contig_chunks;
+ window->prot = pinned_pages->prot;
+ window->mm = mm;
+
+ /* Prepare the remote registration window */
+ if ((err = micscif_prep_remote_window(ep, window))) {
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ micscif_set_nr_pages(ep->remote_dev, window);
+ printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
+ goto error_unmap;
+ }
+
+ /* Tell the peer about the new window */
+ if ((err = micscif_send_scif_register(ep, window))) {
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
+ goto error_unmap;
+ }
+
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+
+ /* No further failures expected. Insert new window */
+ mutex_lock(&ep->rma_info.rma_lock);
+ set_window_ref_count(window, pinned_pages->nr_pages);
+ micscif_insert_window(window, &ep->rma_info.reg_list);
+ mutex_unlock(&ep->rma_info.rma_lock);
+
+ pr_debug("SCIFAPI register: ep %p %s addr %p"
+ " len 0x%lx computed_offset 0x%llx\n",
+ epd, scif_ep_states[epd->state], addr, len, computed_offset);
+ return computed_offset;
+error_unmap:
+ micscif_destroy_window(ep, window);
+error:
+ printk(KERN_ERR "%s %d err %ld\n", __func__, __LINE__, err);
+ return err;
+}
+
/*
 * scif_register() - exported wrapper that holds an endpoint kref
 * across __scif_register().
 */
off_t
scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
		int prot, int map_flags)
{
	off_t ret;
	/* Pin the endpoint for the duration of the call */
	get_kref_count(epd);
	ret = __scif_register(epd, addr, len, offset, prot, map_flags);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_register);
+
+/**
+ * scif_unregister - Release a memory region registered for remote access.
+ * @epd: endpoint descriptor
+ * @offset: start of range to unregister
+ * @len: length of range to unregister
+ *
+ * Return Values
+ * Upon successful completion, scif_unegister() returns zero
+ * else an apt error is returned as documented in scif.h.
+ */
+int
+__scif_unregister(scif_epd_t epd, off_t offset, size_t len)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ struct reg_range_t *window = NULL;
+ struct micscif_rma_req req;
+ int nr_pages, err;
+
+ pr_debug("SCIFAPI unregister: ep %p %s offset 0x%lx len 0x%lx\n",
+ ep, scif_ep_states[ep->state], offset, len);
+
+ /* len must be page aligned. len should be non zero */
+ if ((!len) ||
+ (align_low((uint64_t)len, PAGE_SIZE) != (uint64_t)len))
+ return -EINVAL;
+
+ /* Offset is not page aligned or offset+len wraps around */
+ if ((align_low(offset, PAGE_SIZE) != offset) ||
+ (offset + (off_t)len < offset))
+ return -EINVAL;
+
+ if ((err = verify_epd(ep)))
+ return err;
+
+ might_sleep();
+ nr_pages = (int)(len >> PAGE_SHIFT);
+
+ req.out_window = &window;
+ req.offset = offset;
+ req.prot = 0;
+ req.nr_bytes = len;
+ req.type = WINDOW_FULL;
+ req.head = &ep->rma_info.reg_list;
+
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+ mutex_lock(&ep->rma_info.rma_lock);
+ /* Does a valid window exist? */
+ if ((err = micscif_query_window(&req))) {
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ goto error;
+ }
+ /* Unregister all the windows in this range */
+ if ((err = micscif_rma_list_unregister(window, offset, nr_pages)))
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+error:
+ mutex_unlock(&ep->rma_info.rma_lock);
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ return err;
+}
+
/*
 * scif_unregister() - exported wrapper that holds an endpoint kref
 * across __scif_unregister().
 */
int
scif_unregister(scif_epd_t epd, off_t offset, size_t len)
{
	int ret;
	/* Pin the endpoint for the duration of the call */
	get_kref_count(epd);
	ret = __scif_unregister(epd, offset, len);
	put_kref_count(epd);
	return ret;
}
EXPORT_SYMBOL(scif_unregister);
+
/*
 * scif_pollfd() - poll entry point; holds an endpoint kref across
 * __scif_pollfd() so the endpoint cannot be released mid-call.
 */
unsigned int scif_pollfd(struct file *f, poll_table *wait, scif_epd_t epd)
{
	unsigned int ret;
	/* Pin the endpoint for the duration of the call */
	get_kref_count(epd);
	ret = __scif_pollfd(f, wait, (struct endpt *)epd);
	put_kref_count(epd);
	return ret;
}
+
/*
 * __scif_pollfd() - compute the poll mask for an endpoint.
 *
 * Reports SCIF_POLLOUT when an in-progress connect has resolved or
 * send space is available, SCIF_POLLIN when a connect request or
 * inbound data is pending, SCIF_POLLERR on invalid states and
 * SCIF_POLLHUP once the endpoint is disconnected. The endpoint lock
 * is dropped around poll_wait() calls that may sleep and reacquired
 * afterwards.
 */
unsigned int __scif_pollfd(struct file *f, poll_table *wait, struct endpt *ep)
{
	unsigned int mask = 0;
	unsigned long sflags;

	pr_debug("SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]);

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	spin_lock_irqsave(&ep->lock, sflags);

	/* An async connect is pending: writable once it resolves */
	if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
		if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
#else
		if (!wait || wait->key & SCIF_POLLOUT) {
#endif
			poll_wait(f, &ep->conn_pend_wq, wait);
			if (ep->state == SCIFEP_CONNECTED ||
				ep->state == SCIFEP_DISCONNECTED ||
				ep->conn_err) {
				mask |= SCIF_POLLOUT;
			}
			goto return_scif_poll;
		}
	}

	/* Is it OK to use wait->key?? */
	if (ep->state == SCIFEP_LISTENING) {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
		if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
#else
		if (!wait || wait->key & SCIF_POLLIN) {
#endif
			/* Drop the lock: poll_wait() may sleep */
			spin_unlock_irqrestore(&ep->lock, sflags);
			poll_wait(f, &ep->conwq, wait);
			spin_lock_irqsave(&ep->lock, sflags);
			if (ep->conreqcnt)
				mask |= SCIF_POLLIN;
		} else {
			mask |= SCIF_POLLERR;
		}
		goto return_scif_poll;
	}

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	if (!wait || poll_requested_events(wait) & SCIF_POLLIN) {
#else
	if (!wait || wait->key & SCIF_POLLIN) {
#endif
		if (ep->state != SCIFEP_CONNECTED &&
			ep->state != SCIFEP_LISTENING &&
			ep->state != SCIFEP_DISCONNECTED) {
			mask |= SCIF_POLLERR;
			goto return_scif_poll;
		}

		/* Drop the lock: poll_wait() may sleep */
		spin_unlock_irqrestore(&ep->lock, sflags);
		poll_wait(f, &ep->recvwq, wait);
		spin_lock_irqsave(&ep->lock, sflags);
		/* Readable if at least one byte is queued inbound */
		if (micscif_rb_count(&ep->qp_info.qp->inbound_q, 1))
			mask |= SCIF_POLLIN;
	}

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	if (!wait || poll_requested_events(wait) & SCIF_POLLOUT) {
#else
	if (!wait || wait->key & SCIF_POLLOUT) {
#endif
		if (ep->state != SCIFEP_CONNECTED &&
			ep->state != SCIFEP_LISTENING) {
			mask |= SCIF_POLLERR;
			goto return_scif_poll;
		}

		/* Drop the lock: poll_wait() may sleep */
		spin_unlock_irqrestore(&ep->lock, sflags);
		poll_wait(f, &ep->sendwq, wait);
		spin_lock_irqsave(&ep->lock, sflags);
		/* Writable if the outbound ring buffer has space */
		if (micscif_rb_space(&ep->qp_info.qp->outbound_q))
			mask |= SCIF_POLLOUT;
	}

return_scif_poll:
	/* If the endpoint is in the disconnected state then return hangup instead of error */
	if (ep->state == SCIFEP_DISCONNECTED) {
		mask &= ~SCIF_POLLERR;
		mask |= SCIF_POLLHUP;
	}

	spin_unlock_irqrestore(&ep->lock, sflags);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	return mask;
}
+
/*
 * The private data field of each VMA used to mmap a remote window
 * points to an instance of struct vma_pvt.  It is shared (kref'd) by
 * every VMA cloned from the original mapping via fork/split.
 */
struct vma_pvt {
	struct endpt *ep; /* End point for remote window */
	uint64_t offset; /* offset within remote window */
	bool valid_offset; /* offset is valid only if the original
			    * mmap request was for a single page
			    * else the offset within the vma is
			    * the correct offset
			    */
	struct kref ref; /* last kref_put() frees this via vma_pvt_release() */
};
+
+static void vma_pvt_release(struct kref *ref)
+{
+ struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);
+ kfree(vmapvt);
+}
+
+/**
+ * scif_vma_open - VMA open driver callback
+ * @vma: VMM memory area.
+ * The open method is called by the kernel to allow the subsystem implementing
+ * the VMA to initialize the area. This method is invoked any time a new
+ * reference to the VMA is made (when a process forks, for example).
+ * The one exception happens when the VMA is first created by mmap;
+ * in this case, the driver's mmap method is called instead.
+ * This function is also invoked when an existing VMA is split by the kernel
+ * due to a call to munmap on a subset of the VMA resulting in two VMAs.
+ * The kernel invokes this function only on one of the two VMAs.
+ *
+ * Return Values: None.
+ */
+static void scif_vma_open(struct vm_area_struct *vma)
+{
+ struct vma_pvt *vmapvt = ((vma)->vm_private_data);
+ pr_debug("SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
+ ((vma)->vm_start), ((vma)->vm_end));
+ kref_get(&vmapvt->ref);
+}
+
/**
 * scif_munmap - VMA close driver callback.
 * @vma: VMM memory area.
 *
 * When an area is destroyed, the kernel calls its close operation.
 * Note that there's no usage count associated with VMA's; the area
 * is opened and closed exactly once by each process that uses it.
 *
 * Looks up the remote window backing @vma, tears down the mappings for
 * the VMA's page range, and drops the vma_pvt reference taken at
 * mmap/vma-open time.
 *
 * Return Values: None.
 */
void scif_munmap(struct vm_area_struct *vma)
{
	struct endpt *ep;
	struct vma_pvt *vmapvt = ((vma)->vm_private_data);
	int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT );
	uint64_t offset;
	struct micscif_rma_req req;
	struct reg_range_t *window = NULL;
	int err;

	might_sleep();
	pr_debug("SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
		((vma)->vm_start), ((vma)->vm_end));
	/* used to be a BUG_ON(), prefer keeping the kernel alive */
	if (!vmapvt) {
		WARN_ON(1);
		printk(KERN_ERR "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
			((vma)->vm_start), ((vma)->vm_end));
		return;
	}

	ep = vmapvt->ep;
	/* For single-page VMAs the kernel overwrote vm_pgoff with the pfn,
	 * so use the offset that scif_mmap() saved instead. */
	offset = vmapvt->valid_offset ? vmapvt->offset :
		((vma)->vm_pgoff) << PAGE_SHIFT;
	pr_debug("SCIFAPI munmap: ep %p %s nr_pages 0x%x offset 0x%llx\n",
		ep, scif_ep_states[ep->state], nr_pages, offset);

	/* Build a lookup for the remote window covering this byte range. */
	req.out_window = &window;
	req.offset = offset;
	req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start);
	req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE);
	req.type = WINDOW_PARTIAL;
	req.head = &ep->rma_info.remote_reg_list;

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	mutex_lock(&ep->rma_info.rma_lock);

	if ((err = micscif_query_window(&req)))
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	else
		micscif_rma_list_munmap(window, offset, nr_pages);

	mutex_unlock(&ep->rma_info.rma_lock);
	micscif_dec_node_refcnt(ep->remote_dev, 1);

	/* Release the node dependency taken in scif_mmap(). */
	micscif_destroy_node_dep(ep->remote_dev, nr_pages);

	/*
	 * The kernel probably zeroes these out but we still want
	 * to clean up our own mess just in case.
	 */
	vma->vm_ops = NULL;
	((vma)->vm_private_data) = NULL;
	kref_put(&vmapvt->ref, vma_pvt_release);
	micscif_rma_put_task(ep, nr_pages);
}
+
/* VMA callbacks installed on every mapping created by scif_mmap(). */
static const struct vm_operations_struct micscif_vm_ops = {
	.open = scif_vma_open,
	.close = scif_munmap,
};
+
/**
 * scif_mmap - Map pages in virtual address space to a remote window.
 * @vma: VMM memory area.
 * @epd: endpoint descriptor
 *
 * Looks up the remote registered window covering the requested range,
 * maps its pages into @vma, and installs vm_ops plus the vma_pvt
 * private data used later by scif_vma_open()/scif_munmap().
 *
 * Return Values
 * Upon successful completion, scif_mmap() returns zero
 * else an apt error is returned as documented in scif.h.
 */
int
scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
{
	struct micscif_rma_req req;
	struct reg_range_t *window = NULL;
	struct endpt *ep = (struct endpt *)epd;
	uint64_t start_offset = ((vma)->vm_pgoff) << PAGE_SHIFT;
	int nr_pages = (int)( (((vma)->vm_end) - ((vma)->vm_start)) >> PAGE_SHIFT);
	int err;
	struct vma_pvt *vmapvt;

	pr_debug("SCIFAPI mmap: ep %p %s start_offset 0x%llx nr_pages 0x%x\n",
		ep, scif_ep_states[ep->state], start_offset, nr_pages);

	if ((err = verify_epd(ep)))
		return err;

	might_sleep();

	/* Charge the pages to the task up front; undone on any failure. */
	if ((err = micscif_rma_get_task(ep, nr_pages)))
		return err;

	if (!(vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL))) {
		micscif_rma_put_task(ep, nr_pages);
		return -ENOMEM;
	}

	vmapvt->ep = ep;
	kref_init(&vmapvt->ref);

	micscif_create_node_dep(ep->remote_dev, nr_pages);

	req.out_window = &window;
	req.offset = start_offset;
	req.nr_bytes = ((vma)->vm_end) - ((vma)->vm_start);
	req.prot = ((vma)->vm_flags) & (VM_READ | VM_WRITE);
	req.type = WINDOW_PARTIAL;
	req.head = &ep->rma_info.remote_reg_list;

	micscif_inc_node_refcnt(ep->remote_dev, 1);
	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	if ((err = micscif_query_window(&req))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto error;
	}
	RMA_MAGIC(window);

	/* Default prot for loopback */
	if (!is_self_scifdev(ep->remote_dev)) {
#ifdef _MIC_SCIF_
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
#else
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
#endif
	}

	/*
	 * VM_DONTCOPY - Do not copy this vma on fork
	 * VM_DONTEXPAND - Cannot expand with mremap()
	 * VM_RESERVED - Count as reserved_vm like IO
	 * VM_PFNMAP - Page-ranges managed without "struct page"
	 * VM_IO - Memory mapped I/O or similar
	 *
	 * We do not want to copy this VMA automatically on a fork(),
	 * expand this VMA due to mremap() or swap out these pages since
	 * the VMA is actually backed by physical pages in the remote
	 * node's physical memory and not via a struct page.
	 */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP | VM_PFNMAP;
#else
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP;
#endif

	if (!is_self_scifdev(ep->remote_dev))
		((vma)->vm_flags) |= VM_IO;

	/* Map this range of windows */
	if ((err = micscif_rma_list_mmap(window,
			start_offset, nr_pages, vma))) {
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		goto error;
	}
	/* Set up the driver call back */
	vma->vm_ops = &micscif_vm_ops;
	((vma)->vm_private_data) = vmapvt;
	/*
	 * For 1 page sized VMAs the kernel (remap_pfn_range) replaces the
	 * offset in the VMA with the pfn, so in that case save off the
	 * original offset, since the page sized VMA can't be split into
	 * smaller VMAs the offset is not going to change.
	 */
	if (nr_pages == 1) {
		vmapvt->offset = start_offset;
		vmapvt->valid_offset = true;
	}
	err = 0;
error:
	/* On failure undo everything acquired above: node dependency,
	 * vmapvt allocation, and the task page accounting. */
	mutex_unlock(&ep->rma_info.rma_lock);
	micscif_dec_node_refcnt(ep->remote_dev, 1);
	if (err) {
		micscif_destroy_node_dep(ep->remote_dev, nr_pages);
		kfree(vmapvt);
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
		micscif_rma_put_task(ep, nr_pages);
	}
	return err;
}
+
+/**
+ * scif_readfrom() - Read SCIF offset data from remote connection
+ * @epd: endpoint descriptor
+ * @loffset: offset in local registered address space to which to copy
+ * @len: length of range to copy
+ * @roffset: offset in remote registered address space from which to copy
+ * @flags: flags
+ *
+ * Return Values
+ * Upon successful completion, scif_readfrom() returns zero
+ * else an apt error is returned as documented in scif.h.
+ */
+int
+scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
+ off_t roffset, int flags)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_readfrom(epd, loffset, len, roffset, flags);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_readfrom);
+
+/**
+ * scif_writeto() - Send SCIF offset data to remote connection
+ * @epd: endpoint descriptor
+ * @loffset: offset in local registered address space from which to copy
+ * @len: length of range to copy
+ * @roffset: offset in remote registered address space to which to copy
+ * @flags: flags
+ *
+ * Return Values
+ * Upon successful completion, scif_writeto() returns zero
+ * else an apt error is returned as documented in scif.h.
+ *
+ */
+int scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
+ off_t roffset, int flags)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_writeto(epd, loffset, len, roffset, flags);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_writeto);
+
/* Sentinel mark handed out for host loopback, where no DMA is involved. */
#define HOST_LOOPB_MAGIC_MARK 0xdead

/**
 * scif_fence_mark:
 * @epd: endpoint descriptor
 * @flags: control flags
 * @mark: marked handle returned as output.
 *
 * scif_fence_mark() returns after marking the current set of all uncompleted
 * RMAs initiated through the endpoint epd or marking the current set of all
 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
 * marked with a value returned in mark. The application may subsequently
 * await completion of all RMAs so marked.
 *
 * Return Values
 * Upon successful completion, scif_fence_mark() returns 0;
 * else an apt error is returned as documented in scif.h.
 */
int __scif_fence_mark(scif_epd_t epd, int flags, int *mark)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;

	pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x\n",
		ep, scif_ep_states[ep->state], flags, *mark);

	if ((err = verify_epd(ep)))
		return err;

	/* Invalid flags? */
	if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))
		return -EINVAL;

	/* At least one of init self or peer RMA should be set */
	if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
		return -EINVAL;

	/* Exactly one of init self or peer RMA should be set but not both */
	if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
		return -EINVAL;

#ifndef _MIC_SCIF_
	/*
	 * Host Loopback does not need to use DMA.
	 * Return a valid mark to be symmetric.
	 */
	if (is_self_scifdev(ep->remote_dev)) {
		*mark = HOST_LOOPB_MAGIC_MARK;
		return 0;
	}
#endif

	/* SELF: mark locally; PEER: ask the remote node for a mark. */
	if (flags & SCIF_FENCE_INIT_SELF) {
		if ((*mark = micscif_fence_mark(epd)) < 0)
			err = *mark;
	} else {
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		err = micscif_send_fence_mark(ep, mark);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
	}
	if (err)
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);

	pr_debug("SCIFAPI fence_mark: ep %p %s flags 0x%x mark 0x%x err %d\n",
		ep, scif_ep_states[ep->state], flags, *mark, err);
	return err;
}
+
+int scif_fence_mark(scif_epd_t epd, int flags, int *mark)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_fence_mark(epd, flags, mark);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_fence_mark);
+
/**
 * scif_fence_wait:
 * @epd: endpoint descriptor
 * @mark: mark request.
 *
 * scif_fence_wait() returns after all RMAs marked with mark have completed.
 *
 * Return Values
 * Upon successful completion, scif_fence_wait() returns 0;
 * else an apt error is returned as documented in scif.h.
 */
int __scif_fence_wait(scif_epd_t epd, int mark)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;

	pr_debug("SCIFAPI fence_wait: ep %p %s mark 0x%x\n",
		ep, scif_ep_states[ep->state], mark);

	if ((err = verify_epd(ep)))
		return err;

#ifndef _MIC_SCIF_
	/*
	 * Host Loopback does not need to use DMA.
	 * The only valid mark ever handed out is HOST_LOOPB_MAGIC_MARK
	 * (see __scif_fence_mark()), so simply return success if the
	 * mark matches it.
	 */
	if (is_self_scifdev(ep->remote_dev)) {
		if (HOST_LOOPB_MAGIC_MARK == mark)
			return 0;
		else
			return -EINVAL;
	}
#endif
	/* Remote marks are forwarded to the peer; local marks are waited
	 * on directly via the endpoint's DMA channel. */
	if (mark & SCIF_REMOTE_FENCE) {
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		err = micscif_send_fence_wait(epd, mark);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
	} else {
		err = dma_mark_wait(epd->rma_info.dma_chan, mark, true);
		/* Kick the misc workqueue to reap temp windows, if any. */
		if (!err && atomic_read(&ep->rma_info.tw_refcount))
			queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
	}

	if (err < 0)
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	return err;
}
+
+int scif_fence_wait(scif_epd_t epd, int mark)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_fence_wait(epd, mark);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_fence_wait);
+
/*
 * scif_fence_signal:
 * @epd: endpoint descriptor
 * @loff: local offset
 * @lval: local value to write to loffset
 * @roff: remote offset
 * @rval: remote value to write to roffset
 * @flags: flags
 *
 * scif_fence_signal() returns after marking the current set of all
 * uncompleted RMAs initiated through the endpoint epd or marking
 * the current set of all uncompleted RMAs initiated through the peer
 * of endpoint epd.
 *
 * Return Values
 * Upon successful completion, scif_fence_signal() returns 0;
 * else an apt error is returned as documented in scif.h.
 */
int __scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
	off_t roff, uint64_t rval, int flags)
{
	struct endpt *ep = (struct endpt *)epd;
	int err = 0;

	pr_debug("SCIFAPI fence_signal: ep %p %s loff 0x%lx lval 0x%llx "
		"roff 0x%lx rval 0x%llx flags 0x%x\n",
		ep, scif_ep_states[ep->state], loff, lval, roff, rval, flags);

	if ((err = verify_epd(ep)))
		return err;

	/* Invalid flags? */
	if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER |
			SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))
		return -EINVAL;

	/* At least one of init self or peer RMA should be set */
	if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
		return -EINVAL;

	/* Exactly one of init self or peer RMA should be set but not both */
	if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
		return -EINVAL;

	/* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */
	if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)))
		return -EINVAL;

	/* Only Dword offsets allowed */
	if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(uint32_t) - 1)))
		return -EINVAL;

	/* Only Dword aligned offsets allowed */
	if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(uint32_t) - 1)))
		return -EINVAL;

	if (flags & SCIF_FENCE_INIT_PEER) {
		/* Peer-initiated RMAs: forward the whole request remotely. */
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		err = micscif_send_fence_signal(epd, roff,
			rval, loff, lval, flags);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
	} else {
		/* Local Signal in Local RAS */
		if (flags & SCIF_SIGNAL_LOCAL)
			if ((err = micscif_prog_signal(epd, loff,
					lval, RMA_WINDOW_SELF)))
				goto error_ret;

		/* Signal in Remote RAS */
		if (flags & SCIF_SIGNAL_REMOTE) {
			micscif_inc_node_refcnt(ep->remote_dev, 1);
			err = micscif_prog_signal(epd, roff,
				rval, RMA_WINDOW_PEER);
			micscif_dec_node_refcnt(ep->remote_dev, 1);
		}
	}
error_ret:
	if (err)
		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
	else if (atomic_read(&ep->rma_info.tw_refcount))
		/* Kick the misc workqueue to reap temp windows. */
		queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
	return err;
}
+
+int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval,
+ off_t roff, uint64_t rval, int flags)
+{
+ int ret;
+ get_kref_count(epd);
+ ret = __scif_fence_signal(epd, loff, lval, roff, rval, flags);
+ put_kref_count(epd);
+ return ret;
+}
+EXPORT_SYMBOL(scif_fence_signal);
+
+/**
+ * scif_get_nodeIDs - Return information about online nodes
+ * @nodes: array space reserved for returning online node IDs
+ * @len: number of entries on the nodes array
+ * @self: address to place the node ID of this system
+ *
+ * Return Values
+ * scif_get_nodeIDs() returns the total number of scif nodes
+ * (including host) in the system
+ */
+int
+scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self)
+{
+ int online = 0;
+ int offset = 0;
+ int node;
+#ifdef _MIC_SCIF_
+ micscif_get_node_info();
+#endif
+
+ *self = ms_info.mi_nodeid;
+ mutex_lock(&ms_info.mi_conflock);
+ len = SCIF_MIN(len, (int32_t)ms_info.mi_total);
+ for (node = 0; node <=(int32_t)ms_info.mi_maxid; node++) {
+ if (ms_info.mi_mask & (1UL << node)) {
+ online++;
+ if (offset < len)
+ nodes[offset++] = node;
+ }
+ }
+ pr_debug("SCIFAPI get_nodeIDs total %d online %d filled in %d nodes\n",
+ ms_info.mi_total, online, len);
+ mutex_unlock(&ms_info.mi_conflock);
+
+ return online;
+}
+
+EXPORT_SYMBOL(scif_get_nodeIDs);
+
+/**
+ * micscif_pci_dev:
+ * @node: node ID
+ *
+ * Return the pci_dev associated with a node.
+ */
+int micscif_pci_dev(uint16_t node, struct pci_dev **pdev)
+{
+#ifdef _MIC_SCIF_
+ /* This *is* a PCI device, therefore no pdev to return. */
+ return -ENODEV;
+#else
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);
+ *pdev = mic_ctx->bi_pdev;
+ return 0;
+#endif
+}
+
#ifndef _MIC_SCIF_
/**
 * micscif_pci_info:
 * @node: node ID
 * @dev: scif_pci_info structure to fill in
 *
 * Populate the pci device info pointer associated with a node.
 *
 * For each PCI BAR of the node's device, records a kernel virtual
 * address: the aperture mapping for the prefetchable BAR that matches
 * the aperture's physical address, the MMIO mapping for
 * non-prefetchable BARs, and NULL otherwise.
 *
 * Returns 0 on success, -ENODEV when no context exists for @node.
 */
int micscif_pci_info(uint16_t node, struct scif_pci_info *dev)
{
	int i;
	mic_ctx_t *mic_ctx = get_per_dev_ctx(node - 1);
	struct pci_dev *pdev;

	if (!mic_ctx)
		return -ENODEV;

	dev->pdev = pdev = mic_ctx->bi_pdev;
	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
		/* Unimplemented BAR: no mapping to report. */
		if (!pci_resource_start(pdev, i)) {
			dev->va[i] = NULL;
			continue;
		}
		if (pci_resource_flags(pdev, i) & IORESOURCE_PREFETCH) {
			/* TODO: Change comparison check for KNL. */
			if (pci_resource_start(pdev, i) == mic_ctx->aper.pa)
				dev->va[i] = mic_ctx->aper.va;
			else
				dev->va[i] = NULL;
		} else {
			/* NOTE(review): every non-prefetchable BAR is mapped
			 * to the single MMIO va -- presumably only one such
			 * BAR exists on this hardware; confirm. */
			dev->va[i] = mic_ctx->mmio.va;
		}
	}
	return 0;
}
#endif
+
/**
 * scif_pci_info - Populate the pci device info pointer associated with a node
 * @node: the node to query
 * @dev: The scif_pci_info structure to populate.
 *
 * scif_pci_info() populates the provided scif_pci_info structure
 * associated with a node. The requested node ID cannot be the same as
 * the current node. This routine may only return success when called from
 * the host.
 *
 * Return Values
 * Upon successful completion, scif_pci_info() returns 0; otherwise
 * an appropriate error is returned as documented in scif.h.
 */
int scif_pci_info(uint16_t node, struct scif_pci_info *dev)
{
#ifdef _MIC_SCIF_
	/* Card side: there is no host-visible PCI info to hand out. */
	return -EINVAL;
#else
	if (node > ms_info.mi_maxid)
		return -EINVAL;

	/* Reject nodes that are absent or refer to ourselves. */
	if ((scif_dev[node].sd_state == SCIFDEV_NOTPRESENT) ||
	    is_self_scifdev(&scif_dev[node]))
		return -ENODEV;

	return micscif_pci_info(node, dev);
#endif
}
EXPORT_SYMBOL(scif_pci_info);
+
+/*
+ * DEBUG helper functions
+ */
+void
+print_ep_state(struct endpt *ep, char *label)
+{
+ if (ep)
+ printk("%s: EP %p state %s\n",
+ label, ep, scif_ep_states[ep->state]);
+ else
+ printk("%s: EP %p\n state ?\n", label, ep);
+}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+#ifndef _MIC_SCIF_
+#include "mic_common.h"
+#endif
+#include "scif.h"
+#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
+
+#include <linux/module.h>
+
/* Human-readable names for reg_range_t window types, indexed by window->type. */
static char *window_type[] = {
	"NONE",
	"SELF",
	"PEER"};

/* Human-readable names for scif_dev[].sd_state, indexed by the state value. */
static char *scifdev_state[] = {
	"SCIFDEV_NOTPRESENT",
	"SCIFDEV_INIT",
	"SCIFDEV_RUNNING",
	"SCIFDEV_SLEEPING",
	"SCIFDEV_STOPPING",
	"SCIFDEV_STOPPED"};

/* /proc directory entry for the SCIF debug files. */
static struct proc_dir_entry *scif_proc;
/* debugfs root directory for the SCIF debug files. */
static struct dentry *mic_debug = NULL;

/* NOTE(review): not referenced in the visible portion of this file --
 * confirm usage elsewhere before removing. */
#define DEBUG_LEN 10
+
/*
 * scif_ep_show - seq_file show: dump every SCIF endpoint, grouped by list.
 *
 * Walks the listening, connected, disconnected and zombie endpoint lists
 * under their respective locks, printing one line per endpoint.  Zombie
 * endpoints additionally report RMA bookkeeping (list-empty flags and
 * refcounts) to help spot leaked resources.
 */
static int
scif_ep_show(struct seq_file *m, void *data)
{
	struct endpt *ep;
	struct list_head *pos;
	unsigned long sflags;

	seq_printf(m, "EP Address State Port Peer Remote Ep Address\n");
	seq_printf(m, "=================================================================\n");
	/* Listening endpoints have no peer yet: only address/state/port. */
	spin_lock_irqsave(&ms_info.mi_eplock, sflags);
	list_for_each(pos, &ms_info.mi_listen) {
		ep = list_entry(pos, struct endpt, list);
		seq_printf(m, "%p %s %6d\n",
			ep, scif_ep_states[ep->state], ep->port.port);
	}
	spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);

	/* Connected and disconnected lists share mi_connlock. */
	spin_lock_irqsave(&ms_info.mi_connlock, sflags);
	list_for_each(pos, &ms_info.mi_connected) {
		ep = list_entry(pos, struct endpt, list);
		seq_printf(m, "%p %s %6d %2d:%-6d %p\n",
			ep, scif_ep_states[ep->state], ep->port.port, ep->peer.node,
			ep->peer.port, (void *)ep->remote_ep);
	}
	list_for_each(pos, &ms_info.mi_disconnected) {
		ep = list_entry(pos, struct endpt, list);
		seq_printf(m, "%p %s %6d %2d:%-6d %p\n",
			ep, scif_ep_states[ep->state], ep->port.port, ep->peer.node,
			ep->peer.port, (void *)ep->remote_ep);
	}
	spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);

	seq_printf(m, "EP Address State Port Peer Remote Ep Address reg_list "
		"remote_reg_list mmn_list tw_refcount tcw_refcount mi_rma mi_rma_tc "
		"task_list mic_mmu_notif_cleanup\n");
	seq_printf(m, "=================================================================\n");
	spin_lock_irqsave(&ms_info.mi_eplock, sflags);
	list_for_each(pos, &ms_info.mi_zombie) {
		ep = list_entry(pos, struct endpt, list);
		seq_printf(m, "%p %s %6d %2d:%-6d %p %d %d %d %d %d %d %d %d %d\n",
			ep, scif_ep_states[ep->state], ep->port.port, ep->peer.node,
			ep->peer.port, (void *)ep->remote_ep,
			list_empty(&ep->rma_info.reg_list),
			list_empty(&ep->rma_info.remote_reg_list),
			list_empty(&ep->rma_info.mmn_list),
			atomic_read(&ep->rma_info.tw_refcount),
			atomic_read(&ep->rma_info.tcw_refcount),
			list_empty(&ms_info.mi_rma),
			list_empty(&ms_info.mi_rma_tc),
			list_empty(&ep->rma_info.task_list),
#ifdef CONFIG_MMU_NOTIFIER
			list_empty(&ms_info.mi_mmu_notif_cleanup)
#else
			/* No MMU notifier support: print -1 as a placeholder. */
			-1
#endif
			);
	}
	spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);

	return 0;
}
+
/* seq_file open: bind scif_ep_show to this file. */
static int
scif_ep_open(struct inode *inode, struct file *file)
{
	return single_open(file, scif_ep_show, NULL);
}

/* Read-only seq_file operations for the endpoint dump. */
struct file_operations scif_ep_fops = {
	.open = scif_ep_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+
+static int
+scif_rma_window_show(struct seq_file *m, void *data)
+{
+ struct endpt *ep;
+ struct list_head *pos, *item, *tmp;
+ unsigned long sflags;
+ struct reg_range_t *window;
+
+ seq_printf(m, "SCIF Connected EP RMA Window Info\n");
+ seq_printf(m, "=================================================================\n");
+ seq_printf(m, "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n",
+ "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State");
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_for_each(pos, &ms_info.mi_connected) {
+ ep = list_entry(pos, struct endpt, list);
+ if (mutex_trylock(&ep->rma_info.rma_lock)) {
+ list_for_each_safe(item, tmp, &ep->rma_info.reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ seq_printf(m,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ seq_printf(m,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ mutex_unlock(&ep->rma_info.rma_lock);
+ } else
+ seq_printf(m,
+ "Try Again, some other thread has the RMA lock for ep %p\n",
+ ep);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+
+ seq_printf(m, "=================================================================\n");
+ seq_printf(m, "SCIF Zombie EP RMA Window Info\n");
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ list_for_each(pos, &ms_info.mi_zombie) {
+ ep = list_entry(pos, struct endpt, list);
+ if (mutex_trylock(&ep->rma_info.rma_lock)) {
+ list_for_each_safe(item, tmp, &ep->rma_info.reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ seq_printf(m,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ seq_printf(m,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ mutex_unlock(&ep->rma_info.rma_lock);
+ } else
+ seq_printf(m,
+ "Try Again, some other thread has the RMA lock for ep %p\n",
+ ep);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+ seq_printf(m, "=================================================================\n");
+ seq_printf(m, "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n",
+ "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State");
+ spin_lock(&ms_info.mi_rmalock);
+ list_for_each_safe(item, tmp, &ms_info.mi_rma) {
+ window = list_entry(item,
+ struct reg_range_t, list_member);
+ ep = (struct endpt *)window->ep;
+ seq_printf(m, "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ spin_unlock(&ms_info.mi_rmalock);
+
+ return 0;
+}
+
/* seq_file open: bind scif_rma_window_show to this file. */
static int
scif_rma_window_open(struct inode *inode, struct file *file)
{
	return single_open(file, scif_rma_window_show, NULL);
}

/* Read-only seq_file operations for the RMA window dump. */
struct file_operations scif_rma_window_fops = {
	.open = scif_rma_window_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
+
/*
 * scif_rma_xfer_show - seq_file show: per-endpoint RMA transfer state.
 *
 * For every connected endpoint prints the fence refcount, the temporary
 * window refcount, and the DMA channel number (-1 when no channel is
 * currently assigned).
 */
static int
scif_rma_xfer_show(struct seq_file *m, void *data)
{
	struct endpt *ep;
	struct list_head *pos;
	unsigned long sflags;

	seq_printf(m, "SCIF RMA Debug\n");
	seq_printf(m, "=================================================================\n");
	seq_printf(m, "%-16s\t %-16s %-16s %-16s\n",
		"Endpoint", "Fence Ref Count", "Temp Window Ref Count", "DMA CHANNEL");
	spin_lock_irqsave(&ms_info.mi_connlock, sflags);
	list_for_each(pos, &ms_info.mi_connected) {
		ep = list_entry(pos, struct endpt, list);
		seq_printf(m, "%-16p\t%-16d %-16d %-16d\n",
			ep, ep->rma_info.fence_refcount,
			atomic_read(&ep->rma_info.tw_refcount),
			ep->rma_info.dma_chan ? get_chan_num(ep->rma_info.dma_chan): -1);
	}
	spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
	return 0;
}
+
/* seq_file open: bind scif_rma_xfer_show to this file. */
static int
scif_rma_xfer_open(struct inode *inode, struct file *file)
{
	return single_open(file, scif_rma_xfer_show, NULL);
}

/* Read-only seq_file operations for the RMA transfer dump. */
struct file_operations scif_rma_xfer_fops = {
	.open = scif_rma_xfer_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
+
/*
 * scif_dev_show - seq_file show: one line of state per SCIF node device.
 *
 * NOTE(review): reads ms_info and scif_dev[] without taking mi_conflock --
 * presumably tolerable for a debug dump, but values may be torn; confirm.
 */
static int
scif_dev_show(struct seq_file *m, void *data)
{
	int node;

	seq_printf(m, "Total Nodes %d Self Node Id %d Maxid %d\n",
		ms_info.mi_total, ms_info.mi_nodeid, ms_info.mi_maxid);

	seq_printf(m, "%-16s\t%-16s %-16s\t%-16s\t%-8s\t%-8s\t%-8s\n",
		"node_id", "state", "scif_ref_cnt", "scif_map_ref_cnt",
		"wait_status", "conn count", "numa_node");

	for (node = 0; node <= ms_info.mi_maxid; node++)
		seq_printf(m, "%-16d\t%-16s\t0x%-16lx\t%-16d\t%-16lld\t%-16d\t%-16d\n",
			scif_dev[node].sd_node, scifdev_state[scif_dev[node].sd_state],
			atomic_long_read(&scif_dev[node].scif_ref_cnt),
			scif_dev[node].scif_map_ref_cnt,
			scif_dev[node].sd_wait_status,
			scif_dev[node].num_active_conn,
			scif_dev[node].sd_numa_node);

	return 0;
}
+
/* seq_file open: bind scif_dev_show to this file. */
static int
scif_dev_open(struct inode *inode, struct file *file)
{
	return single_open(file, scif_dev_show, NULL);
}

/* Read-only seq_file operations for the node device dump. */
struct file_operations scif_dev_fops = {
	.open = scif_dev_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
+
/*
 * scif_debug_show - seq_file show: miscellaneous driver-wide counters.
 *
 * Dumps zombie endpoint count, watchdog configuration, huge-page stats,
 * optional RMA_DEBUG allocation counters, and list-empty flags for the
 * global endpoint lists.
 */
static int
scif_debug_show(struct seq_file *m, void *data)
{
	seq_printf(m, "Num gtt_entries %d\n", ms_info.nr_gtt_entries);
	/*
	 * Tracking the number of zombies for debug.
	 * Need to make sure they are not being left behind forever.
	 */
	seq_printf(m, "Num Zombie Endpoints %d\n", ms_info.mi_nr_zombies);
	seq_printf(m, "Watchdog timeout %d\n", ms_info.mi_watchdog_to);
	seq_printf(m, "Watchdog enabled %d\n", ms_info.mi_watchdog_enabled);
	seq_printf(m, "Watchdog auto reboot %d\n", ms_info.mi_watchdog_auto_reboot);
	seq_printf(m, "Huge Pages Enabled %d Detected 2mb %lld 4k %lld\n",
		mic_huge_page_enable, ms_info.nr_2mb_pages, ms_info.nr_4k_pages);
#ifdef RMA_DEBUG
	seq_printf(m, "rma_alloc_cnt %ld rma_pin_cnt %ld mmu_notif %ld rma_unaligned_cpu_cnt %ld\n",
		atomic_long_read(&ms_info.rma_alloc_cnt),
		atomic_long_read(&ms_info.rma_pin_cnt),
		atomic_long_read(&ms_info.mmu_notif_cnt),
		atomic_long_read(&ms_info.rma_unaligned_cpu_cnt));
#endif
	seq_printf(m, "List empty? mi_uaccept %d mi_listen %d mi_zombie %d "
		"mi_connected %d mi_disconnected %d\n",
		list_empty(&ms_info.mi_uaccept),
		list_empty(&ms_info.mi_listen),
		list_empty(&ms_info.mi_zombie),
		list_empty(&ms_info.mi_connected),
		list_empty(&ms_info.mi_disconnected));

	return 0;
}
+
+/* seq_file open hook for the "debug" proc entry: attach scif_debug_show. */
+static int scif_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, scif_debug_show, NULL);
+}
+
+/*
+ * file_operations for /proc/scif/debug.
+ * .owner pins this module while the file is open (prevents unload races;
+ * consistent with smpt_file_ops below).
+ */
+struct file_operations scif_debug_fops = {
+ .owner = THIS_MODULE,
+ .open = scif_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * seq_file show handler for the "suspend" proc entry.
+ *
+ * NOTE(review): this "show" has side effects — merely reading the file
+ * requests a disconnect of every remote node (1..mi_total-1) via
+ * micscif_disconnect_node(). Debug/test hook, not a passive dump.
+ */
+static int
+scif_suspend_show(struct seq_file *m, void *data)
+{
+ int node;
+ uint64_t ret;
+ seq_printf(m, "Removing Nodes mask 0x7\n");
+
+ /* Node 0 is the host/self; only remote nodes are disconnected. */
+ for (node = 1; node < ms_info.mi_total; node++) {
+ ret = micscif_disconnect_node(node, 0 , 1);
+ seq_printf(m, "Node %d requested disconnect. ret = %lld\n",
+ node, ret);
+ }
+
+ return 0;
+}
+
+/* seq_file open hook for the "suspend" proc entry: attach scif_suspend_show. */
+static int scif_suspend_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, scif_suspend_show, NULL);
+}
+
+/*
+ * file_operations for /proc/scif/suspend.
+ * .owner pins this module while the file is open (prevents unload races;
+ * consistent with smpt_file_ops below).
+ */
+struct file_operations scif_suspend_fops = {
+ .owner = THIS_MODULE,
+ .open = scif_suspend_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Report the current RMA registration-cache limit in hex. */
+static int
+scif_cache_limit_show(struct seq_file *m, void *data)
+{
+ unsigned long limit = ms_info.mi_rma_tc_limit;
+
+ seq_printf(m, "reg_cache_limit = 0x%lx\n", limit);
+ return 0;
+}
+
+/* seq_file open hook for "reg_cache_limit": attach scif_cache_limit_show. */
+static int scif_cache_limit_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, scif_cache_limit_show, NULL);
+}
+
+/*
+ * file_operations for /proc/scif/reg_cache_limit.
+ * .owner pins this module while the file is open (prevents unload races;
+ * consistent with smpt_file_ops below).
+ * NOTE(review): no .write handler although the entry is created with
+ * S_IWUGO — the legacy (<3.10) path supports writes; confirm intent.
+ */
+struct file_operations scif_cache_limit_fops = {
+ .owner = THIS_MODULE,
+ .open = scif_cache_limit_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#else // LINUX VERSION 3.10
+
+/*
+ * Legacy (pre-3.10) proc read handler: dump every RMA window of every
+ * connected endpoint, then of every zombie endpoint, then the global
+ * mi_rma temp-window list. Uses mutex_trylock on each endpoint's RMA
+ * lock so a busy endpoint is reported as "Try Again" instead of
+ * sleeping while the connection/ep spinlock is held.
+ * Returns the number of bytes written into buf; snprintf length is
+ * clamped to 0 once the page-sized buffer is exhausted (output is
+ * silently truncated, the classic single-page proc limitation).
+ */
+static int
+scif_rma_window_read(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ struct endpt *ep;
+ struct list_head *pos, *item, *tmp;
+ unsigned long sflags;
+ int l = 0;
+ struct reg_range_t *window;
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "SCIF Connected EP RMA Window Info\n");
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "=================================================================\n");
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n",
+ "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State");
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_for_each(pos, &ms_info.mi_connected) {
+ ep = list_entry(pos, struct endpt, list);
+ if (mutex_trylock(&ep->rma_info.rma_lock)) {
+ /* Local registrations, then registrations of the peer. */
+ list_for_each_safe(item, tmp, &ep->rma_info.reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ mutex_unlock(&ep->rma_info.rma_lock);
+ } else
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "Try Again, some other thread has the RMA lock for ep %p\n",
+ ep);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "=================================================================\n");
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "SCIF Zombie EP RMA Window Info\n");
+ /* Same dump for endpoints awaiting teardown (zombie list). */
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ list_for_each(pos, &ms_info.mi_zombie) {
+ ep = list_entry(pos, struct endpt, list);
+ if (mutex_trylock(&ep->rma_info.rma_lock)) {
+ list_for_each_safe(item, tmp, &ep->rma_info.reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ list_for_each_safe(item, tmp, &ep->rma_info.remote_reg_list) {
+ window = list_entry(item, struct reg_range_t, list_member);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ mutex_unlock(&ep->rma_info.rma_lock);
+ } else
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "Try Again, some other thread has the RMA lock for ep %p\n",
+ ep);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "=================================================================\n");
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16s\t%-16s %-16s %-16s %-8s %-8s %-8s\n",
+ "Endpoint", "Type", "Offset", "NumPages", "Prot", "Ref_Count", "Unreg State");
+ /* Finally the global list of temporary RMA windows. */
+ spin_lock(&ms_info.mi_rmalock);
+ list_for_each_safe(item, tmp, &ms_info.mi_rma) {
+ window = list_entry(item,
+ struct reg_range_t, list_member);
+ ep = (struct endpt *)window->ep;
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16p\t%-16s 0x%-16llx %-16lld %-8d %-8d %-8d\n",
+ ep, window_type[window->type], window->offset,
+ window->nr_pages, window->prot, window->ref_count,
+ window->unreg_state);
+ }
+ spin_unlock(&ms_info.mi_rmalock);
+
+ *eof = 1;
+ return l;
+}
+
+/*
+ * Legacy (pre-3.10) proc read handler: per connected endpoint, dump the
+ * RMA transfer state (fence refcount, temporary-window refcount, and the
+ * DMA channel number, or -1 if no channel is assigned).
+ */
+static int
+scif_rma_xfer_read(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ struct endpt *ep;
+ struct list_head *pos;
+ unsigned long sflags;
+ int l = 0;
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 , "SCIF RMA Debug\n");
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "=================================================================\n");
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 , "%-16s\t %-16s %-16s %-16s\n",
+ "Endpoint", "Fence Ref Count", "Temp Window Ref Count", "DMA CHANNEL");
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_for_each(pos, &ms_info.mi_connected) {
+ ep = list_entry(pos, struct endpt, list);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 , "%-16p\t%-16d %-16d %-16d\n",
+ ep, ep->rma_info.fence_refcount,
+ atomic_read(&ep->rma_info.tw_refcount),
+ ep->rma_info.dma_chan ? get_chan_num(ep->rma_info.dma_chan): -1);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+
+ *eof = 1;
+ return l;
+}
+
+/* Place Holder for generic SCIF debug information */
+/*
+ * Legacy (pre-3.10) proc read twin of scif_debug_show(): dumps global
+ * SCIF bookkeeping into the caller-supplied page buffer. Lockless
+ * snapshot; values may be slightly stale.
+ */
+static int
+scif_debug_read(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Num gtt_entries %d\n", ms_info.nr_gtt_entries);
+ /*
+ * Tracking the number of zombies for debug.
+ * Need to make sure they are not being left behind forever.
+ */
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Num Zombie Endpoints %d\n", ms_info.mi_nr_zombies);
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Watchdog timeout %d\n", ms_info.mi_watchdog_to);
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Watchdog enabled %d\n", ms_info.mi_watchdog_enabled);
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Watchdog auto reboot %d\n", ms_info.mi_watchdog_auto_reboot);
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Huge Pages Enabled %d Detected 2mb %lld 4k %lld\n",
+ mic_huge_page_enable, ms_info.nr_2mb_pages, ms_info.nr_4k_pages);
+#ifdef RMA_DEBUG
+ /* RMA leak-tracking counters, only built with RMA_DEBUG. */
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "mm ref cnt %ld rma_alloc_cnt %ld rma_pin_cnt %ld mmu_notif %ld rma_unaligned_cpu_cnt %ld\n",
+ atomic_long_read(&ms_info.rma_mm_cnt),
+ atomic_long_read(&ms_info.rma_alloc_cnt),
+ atomic_long_read(&ms_info.rma_pin_cnt),
+ atomic_long_read(&ms_info.mmu_notif_cnt),
+ atomic_long_read(&ms_info.rma_unaligned_cpu_cnt));
+#endif
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "List empty? mi_uaccept %d mi_listen %d mi_zombie %d "
+ "mi_connected %d mi_disconnected %d\n",
+ list_empty(&ms_info.mi_uaccept),
+ list_empty(&ms_info.mi_listen),
+ list_empty(&ms_info.mi_zombie),
+ list_empty(&ms_info.mi_connected),
+ list_empty(&ms_info.mi_disconnected));
+
+ *eof = 1;
+ return l;
+}
+
+/*
+ * Legacy (pre-3.10) proc read twin of scif_dev_show(): dump the node
+ * topology plus per-node device state. On the card (_MIC_SCIF_) the
+ * node info is refreshed first and the walk is serialized with
+ * mi_conflock; the host path reads without that lock.
+ */
+static int
+scif_dev_info(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+ int node;
+
+#ifdef _MIC_SCIF_
+ micscif_get_node_info();
+
+ mutex_lock(&ms_info.mi_conflock);
+#endif
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Total Nodes %d Self Node Id %d Maxid %d\n",
+ ms_info.mi_total, ms_info.mi_nodeid, ms_info.mi_maxid);
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "%-16s\t%-16s %-16s\t%-16s\t%-8s\t%-8s\t%-8s\n",
+ "node_id", "state", "scif_ref_cnt", "scif_map_ref_cnt",
+ "wait_status", "conn count", "numa_node");
+
+ for (node = 0; node <= ms_info.mi_maxid; node++)
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "%-16d\t%-16s\t0x%-16lx\t%-16d\t%-16lld\t%-16d\t%-16d\n",
+ scif_dev[node].sd_node, scifdev_state[scif_dev[node].sd_state],
+ atomic_long_read(&scif_dev[node].scif_ref_cnt),
+ scif_dev[node].scif_map_ref_cnt,
+ scif_dev[node].sd_wait_status,
+ scif_dev[node].num_active_conn,
+ scif_dev[node].sd_numa_node);
+#ifdef _MIC_SCIF_
+ mutex_unlock(&ms_info.mi_conflock);
+#endif
+
+ *eof = 1;
+ return l;
+}
+
+/*
+ * Legacy (pre-3.10) proc read handler for "suspend".
+ *
+ * NOTE(review): reading this file has side effects — on the card it
+ * invokes the suspend handler; on the host it requests a disconnect of
+ * every remote node. Debug/test hook, not a passive dump.
+ */
+static int
+scif_suspend(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+
+#ifdef _MIC_SCIF_
+ micscif_suspend_handler(NULL, 0, NULL);
+#else
+ {
+ int node;
+ uint64_t ret;
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Removing Nodes mask 0x7\n");
+ /* Node 0 is the host/self; only remote nodes are disconnected. */
+ for (node = 1; node < ms_info.mi_total; node++) {
+ ret = micscif_disconnect_node(node, 0 , 1);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Node %d requested disconnect. ret = %lld\n",
+ node, ret);
+ }
+ }
+#endif
+
+ *eof = 1;
+ return l;
+}
+
+#ifdef _MIC_SCIF_
+/*
+ * Card-only test hook: reading the "crash" proc entry deliberately
+ * panics the card so lost-node handling can be exercised. The code
+ * after panic() never runs.
+ */
+static int
+scif_crash(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "%s %d Crash the Card to test Lost Nodes\n", __func__, __LINE__);
+ panic("Test Lost Node! Crash the card intentionally\n");
+ *eof = 1;
+ return l;
+}
+
+/*
+ * Card-only test hook: reading the "bugon" proc entry triggers BUG_ON
+ * to exercise lost-node handling. The code after BUG_ON(1) never runs.
+ */
+static int
+scif_bugon(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "%s %d Bug on the Card to test Lost Nodes\n", __func__, __LINE__);
+ BUG_ON(1);
+ *eof = 1;
+ return l;
+}
+#endif
+
+/*
+ * Legacy proc read handler: on the card, reading "fail_suspend"
+ * invokes the fail-suspend test handler. No-op on the host.
+ */
+static int
+scif_fail_suspend(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+
+#ifdef _MIC_SCIF_
+ micscif_fail_suspend_handler(NULL, 0, NULL);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Failing Suspend\n");
+#endif
+
+ *eof = 1;
+ return l;
+}
+
+/*
+ * Legacy proc read handler: on the card, reading "resume" invokes the
+ * resume/wake handler. No-op on the host.
+ */
+static int
+scif_resume(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+
+#ifdef _MIC_SCIF_
+ micscif_resume_handler(NULL, 0, NULL);
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0,
+ "Resuming/Waking up node\n");
+#endif
+
+ *eof = 1;
+ return l;
+}
+
+/*
+ * Legacy proc read handler: report the current RMA registration-cache
+ * limit in hex (paired with scif_set_reg_cache_limit below).
+ */
+static int
+scif_get_reg_cache_limit(char *buf, char **start, off_t offset, int len, int *eof, void *data)
+{
+ int l = 0;
+
+ l += snprintf(buf + l, len - l > 0 ? len - l : 0 ,
+ "reg_cache_limit = 0x%lx\n", ms_info.mi_rma_tc_limit);
+ *eof = 1;
+ return l;
+}
+
+/*
+ * Legacy proc write handler: parse the user-supplied string (any base
+ * accepted by simple_strtoul) and store it as the new RMA
+ * registration-cache limit. Returns len on success, negative errno on
+ * failure.
+ *
+ * Fixes vs. original: the kernel buffer was never freed (leak on both
+ * the success and the copy_from_user failure paths), and kzalloc(len)
+ * left no guaranteed NUL terminator once copy_from_user filled all len
+ * bytes — allocate len + 1 so the string is always terminated.
+ */
+static int
+scif_set_reg_cache_limit(struct file *file, const char __user *buffer,
+ unsigned long len, void *unused)
+{
+ unsigned long data = 0;
+ char *p;
+ if (!(p = kzalloc(len + 1, GFP_KERNEL)))
+ return -ENOMEM;
+ if (copy_from_user(p, buffer, len)) {
+ kfree(p);
+ return -EFAULT;
+ }
+ data = simple_strtoul(p, NULL, 0);
+ ms_info.mi_rma_tc_limit = data;
+ kfree(p);
+ return len;
+}
+#endif
+
+#ifdef _MIC_SCIF_
+/*
+ * Card-side debugfs dump of the SMPT (system memory page table): reads
+ * each SMPT register from the host SBOX MMIO window and decodes it —
+ * the DMA address is (value >> 2) << MIC_SYSTEM_PAGE_SHIFT and bit 0
+ * set means snooping is disabled ("OFF").
+ */
+static int smpt_seq_show(struct seq_file *s, void *pos)
+{
+ volatile uint8_t *mm_sbox = scif_dev[SCIF_HOST_NODE].mm_sbox;
+ uint32_t smpt_reg_offset = SBOX_SMPT00;
+ uint32_t smpt_reg_val;
+ int i;
+
+ seq_printf(s,
+ "=================================================================\n");
+ seq_printf(s,"%-11s| %-15s %-14s %-5s \n",
+ "SMPT entry", "SMPT reg value", "DMA addr", "SNOOP");
+ seq_printf(s,
+ "=================================================================\n");
+
+ for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+ smpt_reg_val = readl(mm_sbox + smpt_reg_offset);
+ seq_printf(s,"%-11d| %-#15x %-#14llx %-5s \n",
+ i, smpt_reg_val, ((uint64_t)smpt_reg_val >> 2ULL) << MIC_SYSTEM_PAGE_SHIFT,
+ (smpt_reg_val & 0x1) ? "OFF" : "ON");
+ smpt_reg_offset += 4; /* 32-bit registers, consecutive */
+ }
+
+ seq_printf(s,
+ "=================================================================\n");
+ return 0;
+}
+
+#else
+/*
+ * Host-side debugfs dump of a board's SMPT shadow table: per-entry DMA
+ * address and reference count, taken under smpt_lock. s->private
+ * carries the board id installed at debugfs_create_file() time.
+ * Fix vs. original: the closing separator ended in a stray 'X'; it now
+ * matches the other "====" separator lines.
+ */
+static int smpt_seq_show(struct seq_file *s, void *pos)
+{
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ int i;
+ unsigned long flags;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ seq_printf(s,
+ "=================================================================\n");
+ seq_printf(s,"Board %-2d |%-10s| %-14s %-10s \n",
+ (int)bid + 1, "SMPT entry", "DMA addr", "Reference Count");
+ seq_printf(s,
+ "=================================================================\n");
+
+ if (mic_ctx && mic_ctx->mic_smpt) {
+ spin_lock_irqsave(&mic_ctx->smpt_lock, flags);
+ for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+ seq_printf(s,"%9s|%-10d| %-#14llx %-10lld \n",
+ " ", i, mic_ctx->mic_smpt[i].dma_addr, mic_ctx->mic_smpt[i].ref_count);
+ }
+ spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags);
+ }
+
+ seq_printf(s,
+ "=================================================================\n");
+ return 0;
+}
+#endif
+
+/* debugfs open hook for "smpt": pass the board id through to the show fn. */
+static int
+smpt_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, smpt_seq_show, inode->i_private);
+}
+
+/* debugfs release hook for "smpt": plain single_release passthrough. */
+static int
+smpt_debug_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+/*
+ * debugfs fops for the "smpt" file. const: the table is never modified
+ * and debugfs_create_file() takes a const pointer.
+ */
+static const struct file_operations smpt_file_ops = {
+ .owner = THIS_MODULE,
+ .open = smpt_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = smpt_debug_release
+};
+
+#ifndef _MIC_SCIF_
+/*
+ * Host-side debugfs dump of a card's kernel log buffer. Resolves the
+ * card's log_buf pointer/length through the PCI aperture and streams
+ * the raw bytes, but only in card states where GDDR is accessible
+ * (skipped during reset and related states). Not supported on KNF.
+ *
+ * NOTE(review): log_buf_len_va/log_buf_va are computed as
+ * virt_to_phys(card address) + aperture base — this relies on the
+ * card's phys offset being a valid aperture offset; confirm against
+ * the boot-time exchange that fills mic_ctx->log_buf_addr/len.
+ */
+static int log_buf_seq_show(struct seq_file *s, void *pos)
+{
+ uint64_t bid = (uint64_t)s->private;
+ mic_ctx_t *mic_ctx;
+ void *log_buf_len_va, *log_buf_va;
+ struct micscif_dev *dev;
+
+ mic_ctx = get_per_dev_ctx(bid);
+ if (!mic_ctx || !mic_ctx->log_buf_addr || !mic_ctx->log_buf_len)
+ goto done;
+
+ if (mic_ctx->bi_family == FAMILY_ABR) {
+ seq_printf(s, "log buffer display not supported for KNF\n");
+ goto done;
+ }
+
+ dev = &scif_dev[mic_get_scifnode_id(mic_ctx)];
+ log_buf_len_va = virt_to_phys(mic_ctx->log_buf_len) + mic_ctx->aper.va;
+ log_buf_va = virt_to_phys(mic_ctx->log_buf_addr) + mic_ctx->aper.va;
+
+ /* Only read GDDR in states where the card memory is accessible. */
+ mutex_lock(&mic_ctx->state_lock);
+ switch (mic_ctx->state) {
+ case MIC_BOOT:
+ case MIC_BOOTFAIL:
+ case MIC_ONLINE:
+ case MIC_SHUTDOWN:
+ case MIC_LOST:
+ micscif_inc_node_refcnt(dev, 1);
+ seq_write(s, log_buf_va, *(int*)log_buf_len_va);
+ micscif_dec_node_refcnt(dev, 1);
+ break;
+ case MIC_NORESPONSE:
+ case MIC_READY:
+ /* Cannot access GDDR while reset is ongoing */
+ case MIC_RESET:
+ case MIC_RESETFAIL:
+ case MIC_INVALID:
+ default:
+ break;
+ }
+ mutex_unlock(&mic_ctx->state_lock);
+done:
+ return 0;
+}
+
+/* debugfs open hook for "log_buf": pass the board id through to the show fn. */
+static int
+log_buf_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, log_buf_seq_show, inode->i_private);
+}
+
+/* debugfs release hook for "log_buf": plain single_release passthrough. */
+static int
+log_buf_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+/*
+ * debugfs fops for the "log_buf" file. const: the table is never
+ * modified and debugfs_create_file() takes a const pointer.
+ */
+static const struct file_operations log_buf_ops = {
+ .owner = THIS_MODULE,
+ .open = log_buf_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = log_buf_release
+};
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+/*
+ * Create the /proc/scif debug tree (kernels >= 3.10, proc_create API).
+ * Silently does nothing if proc_mkdir fails; entries are removed by
+ * scif_proc_cleanup().
+ */
+void
+scif_proc_init(void)
+{
+ if ((scif_proc = proc_mkdir("scif", NULL)) != NULL) {
+ proc_create_data("ep", 0444, scif_proc, &scif_ep_fops, NULL);
+ proc_create_data("rma_window", 0444, scif_proc, &scif_rma_window_fops, NULL);
+ proc_create_data("rma_xfer", 0444, scif_proc, &scif_rma_xfer_fops, NULL);
+ proc_create_data("scif_dev", 0444, scif_proc, &scif_dev_fops, NULL);
+ proc_create_data("debug", 0444, scif_proc, &scif_debug_fops, NULL);
+ proc_create_data("suspend", 0444, scif_proc, &scif_suspend_fops, NULL);
+ /* NOTE(review): created writable (S_IWUGO) but scif_cache_limit_fops
+ * has no .write handler — confirm whether writes should be wired up
+ * as in the legacy (<3.10) path. */
+ proc_create("reg_cache_limit", S_IFREG | S_IRUGO | S_IWUGO, scif_proc,
+ &scif_cache_limit_fops);
+ }
+}
+#else
+/*
+ * Create the /proc/scif debug tree (kernels < 3.10, legacy
+ * create_proc_entry/create_proc_read_entry API). "reg_cache_limit"
+ * gets explicit read/write handlers; "ep" is wired to seq_file fops.
+ * Entries are removed by scif_proc_cleanup().
+ */
+void
+scif_proc_init(void)
+{
+ struct proc_dir_entry *reg_cache_limit_entry;
+ struct proc_dir_entry *ep_entry;
+
+ if ((scif_proc = create_proc_entry("scif", S_IFDIR | S_IRUGO, NULL)) != NULL) {
+ create_proc_read_entry("rma_window", 0444, scif_proc, scif_rma_window_read, NULL);
+ create_proc_read_entry("rma_xfer", 0444, scif_proc, scif_rma_xfer_read, NULL);
+ create_proc_read_entry("scif_dev", 0444, scif_proc, scif_dev_info, NULL);
+ create_proc_read_entry("debug", 0444, scif_proc, scif_debug_read, NULL);
+ create_proc_read_entry("suspend", 0444, scif_proc, scif_suspend, NULL);
+ create_proc_read_entry("fail_suspend", 0444, scif_proc, scif_fail_suspend, NULL);
+ create_proc_read_entry("resume", 0444, scif_proc, scif_resume, NULL);
+#ifdef _MIC_SCIF_
+ /* Card-only destructive test hooks. */
+ create_proc_read_entry("crash", 0444, scif_proc, scif_crash, NULL);
+ create_proc_read_entry("bugon", 0444, scif_proc, scif_bugon, NULL);
+#endif
+ if ((reg_cache_limit_entry = create_proc_entry("reg_cache_limit", S_IFREG | S_IRUGO | S_IWUGO, scif_proc))) {
+ reg_cache_limit_entry->write_proc = scif_set_reg_cache_limit;
+ reg_cache_limit_entry->read_proc = scif_get_reg_cache_limit;
+ reg_cache_limit_entry->data = NULL;
+ }
+ if ((ep_entry = create_proc_entry("ep", S_IFREG | S_IRUGO | S_IWUGO, scif_proc))) {
+ ep_entry->proc_fops = &scif_ep_fops;
+ }
+
+
+ }
+}
+#endif // LINUX VERSION
+
+#ifdef _MIC_SCIF_
+/*
+ * Card-side debugfs setup: create /sys/kernel/debug/mic_debug with the
+ * "smpt" dump file and a writable message-logging toggle. Removed by
+ * mic_debug_uninit().
+ */
+void
+mic_debug_init(void)
+{
+ if ((mic_debug = debugfs_create_dir("mic_debug", NULL))) {
+ debugfs_create_file("smpt", 0444, mic_debug, NULL, &smpt_file_ops);
+ debugfs_create_u8("enable_msg_logging", 0666, mic_debug, &(ms_info.en_msg_log));
+ }
+}
+#else
+/*
+ * Host-side debugfs setup, called per board: lazily create the shared
+ * mic_debug root, then a per-board "micN" directory containing the
+ * "smpt" and "log_buf" dump files (board id passed via i_private).
+ * The message-logging toggle lives in the shared root, so repeated
+ * calls recreate it — harmless, same backing variable.
+ */
+void
+mic_debug_init(mic_ctx_t *mic_ctx)
+{
+ char name[DEBUG_LEN];
+ uint64_t id = mic_ctx->bi_id;
+ struct dentry *child;
+
+ if (!mic_debug)
+ mic_debug = debugfs_create_dir("mic_debug", NULL);
+
+ if (mic_debug) {
+ snprintf(name, DEBUG_LEN, "mic%d", (int)id);
+ if ((child = debugfs_create_dir(name, mic_debug))) {
+ debugfs_create_file("smpt", 0444, child, (void*)id, &smpt_file_ops);
+ debugfs_create_file("log_buf", 0444, child, (void*)id, &log_buf_ops);
+ }
+ debugfs_create_u8("enable_msg_logging", 0666, mic_debug, &(ms_info.en_msg_log));
+ }
+}
+#endif
+
+/*
+ * Tear down the whole mic_debug debugfs tree.
+ * debugfs_remove_recursive() is a no-op on NULL, so this is safe even
+ * if mic_debug_init() never ran or failed.
+ */
+void
+mic_debug_uninit(void)
+{
+ debugfs_remove_recursive(mic_debug);
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+/*
+ * Remove the /proc/scif tree (kernels >= 3.10).
+ * Fix vs. original: also reset scif_proc to NULL, matching the legacy
+ * variant below, so a repeated cleanup (or a cleanup-then-init cycle)
+ * cannot act on a stale pointer.
+ */
+void
+scif_proc_cleanup(void)
+{
+ if (scif_proc) {
+ remove_proc_subtree("scif", NULL);
+ scif_proc = NULL;
+ }
+}
+#else
+/*
+ * Remove the /proc/scif tree (kernels < 3.10): every entry created by
+ * the legacy scif_proc_init() must be removed individually, then the
+ * directory itself. scif_proc is reset so cleanup is idempotent.
+ */
+void
+scif_proc_cleanup(void)
+{
+ if (scif_proc) {
+ remove_proc_entry("reg_cache_limit", scif_proc);
+ remove_proc_entry("ep", scif_proc);
+ remove_proc_entry("rma_window", scif_proc);
+ remove_proc_entry("rma_xfer", scif_proc);
+ remove_proc_entry("scif_dev", scif_proc);
+ remove_proc_entry("debug", scif_proc);
+ remove_proc_entry("suspend", scif_proc);
+ remove_proc_entry("fail_suspend", scif_proc);
+ remove_proc_entry("resume", scif_proc);
+#ifdef _MIC_SCIF_
+ remove_proc_entry("crash", scif_proc);
+ remove_proc_entry("bugon", scif_proc);
+#endif
+ remove_proc_entry("scif", NULL);
+ scif_proc = NULL;
+ }
+}
+#endif
+
+#ifdef _MIC_SCIF_
+extern int micscif_max_msg_id;
+
+/*
+ * Test entry point for error injection
+ */
+int
+micscif_error_inject(int scenario)
+{
+ /* Scenario 1: force message-id exhaustion by zeroing the max id. */
+ if (scenario == 1) {
+ micscif_max_msg_id = 0;
+ return 0;
+ }
+
+ pr_debug("Illegal error injection scenario %d\n", scenario);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(micscif_error_inject);
+#endif // _MIC_SCIF_
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+
+/* Per-open-file state for the SCIF char device. */
+struct mic_priv {
+ scif_epd_t epd; /* endpoint created by __scif_open() in scif_fdopen() */
+};
+
+
+/*
+ * Open hook for the SCIF char device: allocate the per-file state and
+ * back it with a fresh endpoint. Returns 0 on success, -ENOMEM if
+ * either allocation fails (also used for __scif_open() failure — not a
+ * scif.h errno, kept for compatibility).
+ * Fix vs. original: dropped the unnecessary cast of the kmalloc result.
+ */
+int
+scif_fdopen(struct file *f)
+{
+ struct mic_priv *priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+
+ if (!priv)
+ return -ENOMEM;
+
+ /* SCIF device */
+ if (!(priv->epd = __scif_open())) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+
+ f->private_data = priv;
+ return 0;
+}
+
+/*
+ * Release hook for the SCIF char device: tear down the endpoint and
+ * free the per-file state once the file's reference count shows no
+ * remaining users (accounts for fork() duplicating the descriptor).
+ *
+ * NOTE(review): the check is f_count == 0, i.e. last reference gone;
+ * confirm against the f_count semantics of the target kernel version
+ * at the point ->release/this helper is invoked.
+ */
+int
+scif_fdclose(struct file *f)
+{
+ struct mic_priv *priv = ((f)->private_data);
+ int err = 0;
+
+ /* Only actually request of tear down of end point if file reference
+ * count is greater than 1. This accounts for the fork() issue.
+ */
+ if (atomic64_read(&f->f_count) == 0) {
+ err = __scif_close(priv->epd);
+ kfree(priv);
+ }
+ return err;
+}
+
+/* mmap hook for the SCIF char device: delegate to scif_mmap() with this
+ * file's endpoint. */
+int
+micscif_mmap(struct file *f, struct vm_area_struct *vma)
+{
+ struct mic_priv *priv = f->private_data;
+
+ return scif_mmap(vma, priv->epd);
+}
+
+/* poll hook for the SCIF char device: delegate to __scif_pollfd() with
+ * this file's endpoint. */
+unsigned int
+micscif_poll(struct file *f, poll_table *wait)
+{
+ struct mic_priv *priv = f->private_data;
+
+ return __scif_pollfd(f, wait, (struct endpt *)priv->epd);
+}
+
+/*
+ * flush hook for the char device. Only acts on the SCIF minor; for it,
+ * flushes the endpoint when the closing owner matches the endpoint's
+ * creator (see fork note below). Always returns 0.
+ */
+int
+micscif_flush(struct file *f, fl_owner_t id)
+{
+ struct mic_priv *priv;
+ dev_t dev;
+ struct endpt *ep;
+
+ priv = (struct mic_priv *)f->private_data;
+ dev = f->f_path.dentry->d_inode->i_rdev;
+ if (MINOR(dev) != 1) // SCIF MINOR
+ return 0;
+
+ ep = priv->epd;
+
+ /* Handles fork issue, making sure an endpoint only closes when the original
+ * thread that created it tries to close it, or when there are no more
+ * references to it.
+ */
+ if (ep->files == id)
+ __scif_flush(ep);
+
+ return 0;
+}
+
+
+/*
+ * Log a failing SCIF call via pr_debug, suppressing -ENOTCONN: it is a
+ * common, uninteresting error that would flood the console.
+ */
+static __always_inline void
+scif_err_debug(int err, const char *str)
+{
+ if (err >= 0 || err == -ENOTCONN)
+ return;
+
+ pr_debug("%s err %d\n", str, err);
+}
+
+
+
+/*
+ * scif_process_ioctl() - dispatch a SCIF char-device ioctl.
+ * @f:   open SCIF file; f->private_data holds the struct mic_priv with
+ *       the endpoint the operation applies to.
+ * @cmd: SCIF_* ioctl number.
+ * @arg: user pointer for struct-carrying commands, or a scalar
+ *       (SCIF_LISTEN backlog, SCIF_FENCE_WAIT mark).
+ *
+ * Returns 0 (or a positive value such as SCIF_VERSION) on success and a
+ * negative errno on failure; unknown commands yield -EINVAL.
+ *
+ * Fixes vs. original: repaired HTML-entity-mangled "&reg"/"&copy"
+ * tokens (previously corrupted to non-ASCII characters, which could not
+ * compile), and removed the dead "ep = ep;" self-assignment plus its
+ * otherwise-unused local in SCIF_ACCEPTREG.
+ */
+int
+scif_process_ioctl(struct file *f, unsigned int cmd, uint64_t arg)
+{
+ struct mic_priv *priv = ((f)->private_data);
+ void __user *argp = (void __user *)arg;
+ int err = 0;
+ struct scifioctl_msg request;
+ bool non_block = false;
+
+ non_block = !!(f->f_flags & O_NONBLOCK);
+
+ switch (cmd) {
+ case SCIF_BIND:
+ {
+ int pn;
+
+ if (copy_from_user(&pn, argp, sizeof(pn))) {
+ return -EFAULT;
+ }
+
+ if ((pn = __scif_bind(priv->epd, pn)) < 0) {
+ return pn;
+ }
+
+ /* Return the (possibly auto-assigned) port number. */
+ if (copy_to_user(argp, &pn, sizeof(pn))) {
+ return -EFAULT;
+ }
+
+ return 0;
+ }
+ case SCIF_LISTEN:
+ return __scif_listen(priv->epd, arg);
+ case SCIF_CONNECT:
+ {
+ struct scifioctl_connect req;
+ struct endpt *ep = (struct endpt *)priv->epd;
+
+ if (copy_from_user(&req, argp, sizeof(struct scifioctl_connect))) {
+ return -EFAULT;
+ }
+
+ if ((err = __scif_connect(priv->epd, &req.peer, non_block)) < 0) {
+ return err;
+ }
+
+ /* Report the local port the connection was made from. */
+ req.self.node = ep->port.node;
+ req.self.port = ep->port.port;
+
+ if (copy_to_user(argp, &req, sizeof(struct scifioctl_connect))) {
+ return -EFAULT;
+ }
+
+
+ return 0;
+ }
+ // Accept is done in two halves. The request ioctl does the basic functionality of accepting
+ // the request and returning the information about it including the internal ID of the
+ // end point. The register is done with the internal ID on a new file descriptor opened by the
+ // requesting process.
+ case SCIF_ACCEPTREQ:
+ {
+ struct scifioctl_accept request;
+ unsigned long sflags;
+ scif_epd_t *ep = (scif_epd_t *)&request.endpt;
+
+ if (copy_from_user(&request, argp, sizeof(struct scifioctl_accept))) {
+ return -EFAULT;
+ }
+
+ if ((err = __scif_accept(priv->epd, &request.peer, ep, request.flags)) < 0) {
+ return err;
+ }
+
+ /* Failed to hand the new endpoint id back: undo the accept. */
+ if (copy_to_user(argp, &request, sizeof(struct scifioctl_accept))) {
+ scif_close(*ep);
+ return -EFAULT;
+ }
+
+ // Add to the list of user mode eps where the second half of the accept
+ // is not yet completed.
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ list_add_tail(&((*ep)->miacceptlist), &ms_info.mi_uaccept);
+ list_add_tail(&((*ep)->liacceptlist), &priv->epd->li_accept);
+ (*ep)->listenep = priv->epd;
+ priv->epd->acceptcnt++;
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+
+ return 0;
+ }
+ case SCIF_ACCEPTREG:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct endpt *newep;
+ struct endpt *lisep;
+ struct endpt *fep = NULL;
+ struct endpt *tmpep;
+ struct list_head *pos, *tmpq;
+ unsigned long sflags;
+
+ // Finally replace the pointer to the accepted endpoint
+ if (copy_from_user(&newep, argp, sizeof(void *)))
+ return -EFAULT;
+
+ // Remove form the user accept queue
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ list_for_each_safe(pos, tmpq, &ms_info.mi_uaccept) {
+ tmpep = list_entry(pos, struct endpt, miacceptlist);
+ if (tmpep == newep) {
+ list_del(pos);
+ fep = tmpep;
+ break;
+ }
+ }
+
+ /* Endpoint not pending registration: reject the user pointer. */
+ if (fep == NULL) {
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+ return -ENOENT;
+ }
+
+ lisep = newep->listenep;
+ list_for_each_safe(pos, tmpq, &lisep->li_accept) {
+ tmpep = list_entry(pos, struct endpt, liacceptlist);
+ if (tmpep == newep) {
+ list_del(pos);
+ lisep->acceptcnt--;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+
+ // Free the resources automatically created from the open.
+ micscif_teardown_ep(priv->epd);
+ micscif_add_epd_to_zombie_list(priv->epd, !MI_EPLOCK_HELD);
+ priv->epd = newep;
+ return 0;
+ }
+ case SCIF_SEND:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+
+ if (copy_from_user(&request, argp,
+ sizeof(struct scifioctl_msg))) {
+ err = -EFAULT;
+ goto send_err;
+ }
+
+ if ((err = scif_user_send(priv->epd, request.msg,
+ request.len, request.flags)) < 0)
+ goto send_err;
+
+ /* err currently holds the byte count; report it to the user. */
+ if (copy_to_user(&((struct scifioctl_msg*)argp)->out_len,
+ &err, sizeof(err))) {
+ err = -EFAULT;
+ goto send_err;
+ }
+ err = 0;
+send_err:
+ scif_err_debug(err, "scif_send");
+ return err;
+ }
+ case SCIF_RECV:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+
+ if (copy_from_user(&request, argp,
+ sizeof(struct scifioctl_msg))) {
+ err = -EFAULT;
+ goto recv_err;
+ }
+
+ if ((err = scif_user_recv(priv->epd, request.msg,
+ request.len, request.flags)) < 0)
+ goto recv_err;
+
+ /* err currently holds the byte count; report it to the user. */
+ if (copy_to_user(&((struct scifioctl_msg*)argp)->out_len,
+ &err, sizeof(err))) {
+ err = -EFAULT;
+ goto recv_err;
+ }
+ err = 0;
+recv_err:
+ scif_err_debug(err, "scif_recv");
+ return err;
+ }
+ case SCIF_REG:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_reg reg;
+ off_t ret;
+
+ if (copy_from_user(&reg, argp, sizeof(reg))) {
+ err = -EFAULT;
+ goto reg_err;
+ }
+ /* SCIF_MAP_KERNEL is reserved for in-kernel callers. */
+ if (reg.flags & SCIF_MAP_KERNEL) {
+ err = -EINVAL;
+ goto reg_err;
+ }
+ if ((ret = __scif_register(priv->epd, reg.addr, reg.len,
+ reg.offset, reg.prot, reg.flags)) < 0) {
+ err = (int)ret;
+ goto reg_err;
+ }
+
+ /* NOTE(review): copies sizeof(reg.out_offset) bytes from an
+ * off_t — confirm the two types are the same width. */
+ if (copy_to_user(&((struct scifioctl_reg*)argp)->out_offset,
+ &ret, sizeof(reg.out_offset))) {
+ err = -EFAULT;
+ goto reg_err;
+ }
+ err = 0;
+reg_err:
+ scif_err_debug(err, "scif_register");
+ return err;
+ }
+ case SCIF_UNREG:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_unreg unreg;
+
+ if (copy_from_user(&unreg, argp, sizeof(unreg))) {
+ err = -EFAULT;
+ goto unreg_err;
+ }
+ err = __scif_unregister(priv->epd, unreg.offset, unreg.len);
+unreg_err:
+ scif_err_debug(err, "scif_unregister");
+ return err;
+ }
+ case SCIF_READFROM:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_copy copy;
+
+ if (copy_from_user(&copy, argp, sizeof(copy))) {
+ err = -EFAULT;
+ goto readfrom_err;
+ }
+ err = __scif_readfrom(priv->epd,
+ copy.loffset,
+ copy.len,
+ copy.roffset,
+ copy.flags);
+readfrom_err:
+ scif_err_debug(err, "scif_readfrom");
+ return err;
+ }
+ case SCIF_WRITETO:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_copy copy;
+
+ if (copy_from_user(&copy, argp, sizeof(copy))) {
+ err = -EFAULT;
+ goto writeto_err;
+ }
+ err = __scif_writeto(priv->epd,
+ copy.loffset,
+ copy.len,
+ copy.roffset,
+ copy.flags);
+writeto_err:
+ scif_err_debug(err, "scif_writeto");
+ return err;
+ }
+ case SCIF_VREADFROM:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_copy copy;
+
+ if (copy_from_user(&copy, argp, sizeof(copy))) {
+ err = -EFAULT;
+ goto vreadfrom_err;
+ }
+ err = __scif_vreadfrom(priv->epd,
+ copy.addr,
+ copy.len,
+ copy.roffset,
+ copy.flags);
+vreadfrom_err:
+ scif_err_debug(err, "scif_vreadfrom");
+ return err;
+ }
+ case SCIF_VWRITETO:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_copy copy;
+
+ if (copy_from_user(&copy, argp, sizeof(copy))) {
+ err = -EFAULT;
+ goto vwriteto_err;
+ }
+ err = __scif_vwriteto(priv->epd,
+ copy.addr,
+ copy.len,
+ copy.roffset,
+ copy.flags);
+vwriteto_err:
+ scif_err_debug(err, "scif_vwriteto");
+ return err;
+ }
+ case SCIF_GET_NODEIDS:
+ {
+ struct scifioctl_nodeIDs nodeIDs;
+ int entries;
+ uint16_t *nodes;
+ uint16_t self;
+
+ if (copy_from_user(&nodeIDs, argp, sizeof(nodeIDs))) {
+ err = -EFAULT;
+ goto getnodes_err2;
+ }
+
+ /* Clamp the user-requested count to what we can report. */
+ entries = SCIF_MIN(MAX_BOARD_SUPPORTED, nodeIDs.len);
+
+ nodes = kmalloc(sizeof(uint16_t) * entries, GFP_KERNEL);
+ if ( (entries != 0) && (!nodes) ){
+ err = -ENOMEM;
+ goto getnodes_err2;
+ }
+ nodeIDs.len = scif_get_nodeIDs(nodes, entries, &self);
+
+ if (copy_to_user(nodeIDs.nodes,
+ nodes, sizeof(uint16_t) * entries)) {
+ err = -EFAULT;
+ goto getnodes_err1;
+ }
+
+ if (copy_to_user(nodeIDs.self,
+ &self, sizeof(uint16_t))) {
+ err = -EFAULT;
+ goto getnodes_err1;
+ }
+
+ if (copy_to_user(argp, &nodeIDs, sizeof(nodeIDs))) {
+ err = -EFAULT;
+ goto getnodes_err1;
+ }
+getnodes_err1:
+ kfree(nodes);
+getnodes_err2:
+ return err;
+ }
+ case SCIF_FENCE_MARK:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_fence_mark mark;
+ int tmp_mark = 0;
+
+ if (copy_from_user(&mark, argp, sizeof(mark))) {
+ err = -EFAULT;
+ goto fence_mark_err;
+ }
+ if ((err = __scif_fence_mark(priv->epd,
+ mark.flags, &tmp_mark)))
+ goto fence_mark_err;
+ if (copy_to_user(mark.mark, &tmp_mark, sizeof(tmp_mark))) {
+ err = -EFAULT;
+ goto fence_mark_err;
+ }
+fence_mark_err:
+ scif_err_debug(err, "scif_fence_mark");
+ return err;
+ }
+ case SCIF_FENCE_WAIT:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ err = __scif_fence_wait(priv->epd, arg);
+ scif_err_debug(err, "scif_fence_wait");
+ return err;
+ }
+ case SCIF_FENCE_SIGNAL:
+ {
+ struct mic_priv *priv = (struct mic_priv *)((f)->private_data);
+ struct scifioctl_fence_signal signal;
+
+ if (copy_from_user(&signal, argp, sizeof(signal))) {
+ err = -EFAULT;
+ goto fence_signal_err;
+ }
+
+ err = __scif_fence_signal(priv->epd, signal.loff,
+ signal.lval, signal.roff, signal.rval, signal.flags);
+fence_signal_err:
+ scif_err_debug(err, "scif_fence_signal");
+ return err;
+ }
+ case SCIF_GET_VERSION:
+ {
+ return SCIF_VERSION;
+ }
+ }
+ return -EINVAL;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+#include "mic/micscif_intr.h"
+#include "mic/micscif_nodeqp.h"
+#include "mic_common.h"
+
+/* Runs in the context of sd_intr_wq */
+static void micscif_intr_bh_handler(struct work_struct *work)
+{
+ struct micscif_dev *scifdev =
+ container_of(work, struct micscif_dev, sd_intr_bh);
+
+ /* figure out which qp we got a recv on */
+ struct micscif_qp *qp = micscif_nodeqp_nextmsg(scifdev);
+ if (qp != NULL) {
+ if (is_self_scifdev(scifdev))
+ micscif_loopb_msg_handler(scifdev, qp);
+ else
+ micscif_nodeqp_intrhandler(scifdev, qp);
+ }
+}
+
+/*
+ * Lazily create the per-device interrupt workqueue and initialize the
+ * bottom-half work item. Idempotent: a second call is a no-op.
+ * Returns 0 on success or -ENOMEM if the workqueue cannot be created.
+ */
+int micscif_setup_interrupts(struct micscif_dev *scifdev)
+{
+	struct workqueue_struct *wq;
+
+	if (scifdev->sd_intr_wq)
+		return 0;
+
+	snprintf(scifdev->sd_intr_wqname, sizeof(scifdev->sd_intr_wqname),
+		 "SCIF INTR %d", scifdev->sd_node);
+
+	/* FIXME: Fix windows */
+	wq = __mic_create_singlethread_workqueue(scifdev->sd_intr_wqname);
+	if (!wq)
+		return -ENOMEM;
+
+	scifdev->sd_intr_wq = wq;
+	INIT_WORK(&scifdev->sd_intr_bh, micscif_intr_bh_handler);
+	return 0;
+}
+
+/* Tear down the interrupt workqueue created by micscif_setup_interrupts(). */
+void micscif_destroy_interrupts(struct micscif_dev *scifdev)
+{
+	destroy_workqueue(scifdev->sd_intr_wq);
+}
+
+#ifdef _MIC_SCIF_
+/* Hard-IRQ handler: defer all processing to the bottom-half workqueue. */
+irqreturn_t micscif_intr_handler(int irq, void *dev_id)
+{
+	struct micscif_dev *scifdev = dev_id;
+
+	queue_work(scifdev->sd_intr_wq, &scifdev->sd_intr_bh);
+	return IRQ_HANDLED;
+}
+
+/*
+ * register_scif_intr_handler() - attach the SCIF interrupt handler to the
+ * least-used RDMASR IRQ.
+ * @dev: per node dev structure to store the intr handle
+ *
+ * IRQ 17 - 24 correspond to registers RDMASR0 - RDMASR7. The slot with
+ * the lowest reference count is chosen, so once more than 7 nodes join
+ * this node's p2p network the IRQ lines start being shared (IRQF_SHARED).
+ */
+int
+register_scif_intr_handler(struct micscif_dev *dev)
+{
+	unsigned int best = 0;
+	unsigned int i;
+	int ret;
+
+	mutex_lock(&ms_info.mi_conflock);
+
+	/* Pick the first RDMASR slot with the smallest reference count. */
+	for (i = 0; i < MAX_RDMASR; i++) {
+		if (ms_info.mi_intr_rcnt[i] < ms_info.mi_intr_rcnt[best])
+			best = i;
+	}
+
+	ret = request_irq(get_rdmasr_irq(best), micscif_intr_handler,
+			  IRQF_SHARED, dev->sd_intr_wqname, dev);
+	if (ret) {
+		printk(KERN_ERR "Cannot request irq number %d, ret = %d\n"
+		       , get_rdmasr_irq(best), ret);
+		goto error;
+	}
+
+	/* Only account the reference once the IRQ is actually attached. */
+	ms_info.mi_intr_rcnt[best]++;
+	dev->sd_intr_handle = best;
+
+	printk("Registered interrupt handler for node %d, for IRQ = %d,"
+	       "handle = %d\n", dev->sd_node, get_rdmasr_irq(best), best);
+
+error:
+	mutex_unlock(&ms_info.mi_conflock);
+	return ret;
+}
+
+/*
+ * deregister_scif_intr_handler() - Deregisters SCIF interrupt
+ * handler from appropriate IRQ
+ * @dev: per node dev structure to retrieve the intr handle
+ *
+ * Drops this device's reference on its (possibly shared) RDMASR IRQ and
+ * frees the irqaction registered with @dev as the cookie.
+ */
+void
+deregister_scif_intr_handler(struct micscif_dev *dev)
+{
+	unsigned int handle = dev->sd_intr_handle;
+
+	/* An out-of-range handle means no handler was ever registered. */
+	if (handle >= MAX_RDMASR)
+		return;
+
+	mutex_lock(&ms_info.mi_conflock);
+	ms_info.mi_intr_rcnt[handle]--;
+
+	/* NOTE(review): this underflow check assumes mi_intr_rcnt[] is a
+	 * signed type — if it is unsigned the test can never fire; confirm
+	 * the declaration in micscif.h. */
+	if (ms_info.mi_intr_rcnt[handle] < 0) {
+		printk("scif intr deregister negative ref count"
+		       " for node %d, handle = %d, IRQ = %d\n", dev->sd_node,
+		       handle, get_rdmasr_irq(handle));
+		WARN_ON(1);
+	}
+
+	/* free_irq() may sleep, so it is called outside the conflock. */
+	mutex_unlock(&ms_info.mi_conflock);
+	free_irq(get_rdmasr_irq(handle), dev);
+	printk("Deregistered interrupt handler for node %d, for IRQ = %d,"
+	       "handle = %d\n", dev->sd_node, get_rdmasr_irq(handle), handle);
+}
+#endif /* _MIC_SCIF_ */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/cdev.h>
+#include <linux/reboot.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34))
+#include <linux/pm_qos_params.h>
+#endif
+
+#include <mic/micscif.h>
+#include <mic/micscif_smpt.h>
+#include <mic/micscif_rb.h>
+#include <mic/micscif_intr.h>
+//#include <micscif_test.h>
+#include <mic/micscif_nodeqp.h>
+#include <mic/mic_dma_api.h>
+#include <mic/micscif_kmem_cache.h>
+/* Include this for suspend/resume notifications from pm driver */
+#include <mic/micscif_nm.h>
+
+#ifdef CONFIG_MK1OM
+/* Power-management device events delivered by the micpm driver. */
+#define MICPM_DEVEVENT_SUSPEND 1
+#define MICPM_DEVEVENT_RESUME 2
+#define MICPM_DEVEVENT_FAIL_SUSPEND 3
+/* Provided by the micpm driver; no header for these is visible here. */
+extern void micpm_device_register(struct notifier_block *n);
+extern void micpm_device_unregister(struct notifier_block *n);
+#endif
+
+/* Node ID of this card in the SCIF network (mandatory module parameter;
+ * micscif_init() refuses to load when it is 0). */
+int scif_id = 0;
+module_param(scif_id, int, 0400);
+MODULE_PARM_DESC(scif_id, "Set scif driver node ID");
+
+/* Physical address of the host queue pair (mandatory module parameter). */
+ulong scif_addr = 0;
+module_param(scif_addr, ulong, 0400);
+MODULE_PARM_DESC(scif_addr, "Set scif driver host address");
+
+/* kmem cache created at init for unaligned transfer buffers. */
+struct kmem_cache *unaligned_cache;
+
+/* Char-device bookkeeping for the "mic" (minor 0) and "scif" (minor 1) nodes. */
+struct mic_info {
+	dev_t m_dev;
+	struct cdev m_cdev;
+	struct class * m_class;
+	struct device * m_scifdev;
+} micinfo;
+
+int micscif_major = SCIF_MAJOR;
+int micscif_minor = 0;
+
+/* Global SCIF driver state shared across this file. */
+struct micscif_info ms_info;
+
+// MAX MIC cards + 1 for the Host
+struct micscif_dev scif_dev[MAX_BOARD_SUPPORTED + 1];
+
+extern mic_dma_handle_t mic_dma_handle;
+
+/* Tunables exposed as module parameters at the bottom of this file. */
+static int mic_pm_qos_cpu_dma_lat = -1;
+static int mic_host_numa_node = -1;
+static unsigned long mic_p2p_proxy_thresh = -1;
+
+#ifdef CONFIG_MK1OM
+/* Route micpm suspend/resume/fail-suspend events to the SCIF handlers;
+ * unknown events are ignored. */
+static int micscif_devevent_handler(struct notifier_block *nb,
+				    unsigned long event,
+				    void *msg)
+{
+	if (event == MICPM_DEVEVENT_SUSPEND)
+		return micscif_suspend_handler(nb, event, msg);
+	else if (event == MICPM_DEVEVENT_RESUME)
+		return micscif_resume_handler(nb, event, msg);
+	else if (event == MICPM_DEVEVENT_FAIL_SUSPEND)
+		return micscif_fail_suspend_handler(nb, event, msg);
+	return 0;
+}
+
+/* Registered with micpm in micscif_init(), unregistered in _micscif_exit(). */
+static struct notifier_block mic_deviceevent = {
+	.notifier_call = micscif_devevent_handler,
+};
+#endif
+
+/* Char device open: minor 0 is the bare test device, minor 1 is SCIF. */
+static int micscif_open(struct inode *in, struct file *f)
+{
+	switch (MINOR(in->i_rdev)) {
+	case 0:
+		/* base mic device access for testing */
+		return 0;
+	case 1:
+		return scif_fdopen(f);
+	default:
+		return -EINVAL;
+	}
+}
+
+/* Dispatch ioctls; only the SCIF minor (1) supports them. */
+static int micscif_ioctl(struct inode *in, struct file *f,
+			 unsigned int cmd, unsigned long arg)
+{
+	if (MINOR(in->i_rdev) != 1)
+		return -EINVAL;
+
+	/* SCIF device */
+	return scif_process_ioctl(f, cmd, arg);
+}
+
+/* unlocked_ioctl wrapper: recover the inode and reuse the legacy
+ * inode-based ioctl handler above. */
+static long micscif_unlocked_ioctl(struct file *f,
+				   unsigned int cmd, unsigned long arg)
+{
+	return (long) micscif_ioctl(f->f_path.dentry->d_inode, f, cmd, arg);
+}
+
+/* Char device release: mirrors micscif_open(). */
+static int micscif_release(struct inode *in, struct file *f)
+{
+	switch (MINOR(in->i_rdev)) {
+	case 0:
+		/* base mic device access for testing */
+		return 0;
+	case 1:
+		return scif_fdclose(f);
+	default:
+		return -EINVAL;
+	}
+}
+
+/* TODO: Need to flush the queue, grab some lock, and probably
+ * notify the remote node we're going down ... right now, we're
+ * just freeing things, which is probably a bad idea :-)
+ */
+static int micscif_uninit_qp(struct micscif_dev *scifdev)
+{
+	int idx;
+
+	/* Unmap the remote pieces of each queue pair and free the locally
+	 * allocated inbound ring buffer, then release the array itself. */
+	for (idx = 0; idx < scifdev->n_qpairs; idx++) {
+		struct micscif_qp *qp = &scifdev->qpairs[idx];
+
+		iounmap(qp->remote_qp);
+		iounmap(qp->outbound_q.rb_base);
+		kfree((void *)qp->inbound_q.rb_base);
+	}
+
+	kfree(scifdev->qpairs);
+	scifdev->n_qpairs = 0;
+	return 0;
+}
+
+/* Forward declaration: the reboot notifier must run the same teardown
+ * path as module unload (see micscif_reboot() below). */
+static int micscif_reboot(struct notifier_block *notifier, unsigned long unused1, void *unused2);
+
+static struct notifier_block micscif_reboot_notifier = {
+	.notifier_call = micscif_reboot,
+	.priority = 0,
+};
+
+extern struct attribute_group scif_attr_group;
+
+/* Undo micscif_setup_base(): destroy the workqueues, sysfs group,
+ * devices, class and cdev in reverse order of their creation. */
+void micscif_destroy_base(void)
+{
+#ifdef CONFIG_MMU_NOTIFIER
+	destroy_workqueue(ms_info.mi_mmu_notif_wq);
+#endif
+	destroy_workqueue(ms_info.mi_misc_wq);
+	destroy_workqueue(ms_info.mi_conn_wq);
+
+	sysfs_remove_group(&micinfo.m_scifdev->kobj, &scif_attr_group);
+	device_destroy(micinfo.m_class, micinfo.m_dev + 1);
+	device_destroy(micinfo.m_class, micinfo.m_dev);
+	class_destroy(micinfo.m_class);
+	cdev_del(&(micinfo.m_cdev));
+	unregister_chrdev_region(micinfo.m_dev, 2);
+}
+
+/*
+ * Common unload/reboot teardown. Ordering is significant throughout:
+ * remote nodes are cleaned up first, the host is told we are exiting,
+ * and only then are the local char devices, interrupts, loopback QP
+ * and DMA channels destroyed.
+ */
+static void _micscif_exit(void)
+{
+	struct list_head *pos, *unused;
+	struct scif_callback *temp;
+	struct micscif_dev *dev;
+	int i;
+
+	pr_debug("Goodbye SCIF!\n");
+	/* Cleanup P2P Node Qp/ Interrupt Handlers */
+	for (i = SCIF_HOST_NODE + 1; i <= MAX_BOARD_SUPPORTED; i++) {
+		dev = &scif_dev[i];
+
+		/* The loopback device is destroyed separately below. */
+		if (is_self_scifdev(dev))
+			continue;
+
+		micscif_cleanup_scifdev(dev, DESTROY_WQ);
+	}
+
+	/* Free every registered event callback. */
+	list_for_each_safe(pos, unused, &ms_info.mi_event_cb) {
+		temp = list_entry(pos, struct scif_callback, list_member);
+		list_del(pos);
+		kfree(temp);
+	}
+	mutex_destroy(&ms_info.mi_event_cblock);
+
+#ifdef CONFIG_MK1OM
+	micpm_device_unregister(&mic_deviceevent);
+#endif
+
+	scif_dev[ms_info.mi_nodeid].sd_state = SCIFDEV_STOPPING;
+	scif_dev[SCIF_HOST_NODE].sd_state = SCIFDEV_STOPPING;
+
+	/* The EXIT message is the last message from MIC to the Host */
+	micscif_send_exit();
+
+	/*
+	 * Deliberate infinite wait for a host response during driver
+	 * unload since the host must inform other SCIF nodes about
+	 * this node going away and then only send a response back
+	 * to this node to avoid this nodes host shutdown handler racing
+	 * with disconnection from the SCIF network. There is a timeout
+	 * on the host for sending a response back so a response will
+	 * be sent else the host has crashed.
+	 */
+	wait_event(ms_info.mi_exitwq,
+		scif_dev[ms_info.mi_nodeid].sd_state == SCIFDEV_STOPPED);
+	scif_proc_cleanup();
+	mic_debug_uninit();
+	micscif_kmem_cache_destroy();
+
+	micscif_destroy_base();
+
+	/* Disable interrupts */
+	deregister_scif_intr_handler(&scif_dev[SCIF_HOST_NODE]);
+	destroy_workqueue(scif_dev[SCIF_HOST_NODE].sd_intr_wq);
+	micscif_destroy_loopback_qp(&scif_dev[ms_info.mi_nodeid]);
+
+	/* Close DMA device */
+	close_dma_device(0, &mic_dma_handle);
+
+	micscif_uninit_qp(&scif_dev[SCIF_HOST_NODE]);
+	iounmap(scif_dev[SCIF_HOST_NODE].mm_sbox);
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34))
+	pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "micscif");
+#endif
+}
+
+/* Module unload: detach the reboot notifier, then run common teardown. */
+static void micscif_exit(void)
+{
+	unregister_reboot_notifier(&micscif_reboot_notifier);
+	_micscif_exit();
+}
+
+/* Reboot notifier: run the same teardown as unload so the host learns
+ * that this node is going away before the card resets. */
+static int micscif_reboot(struct notifier_block *notifier, unsigned long unused1, void *unused2)
+{
+	_micscif_exit();
+	return NOTIFY_OK;
+}
+
+/* File operations shared by both minors; per-minor dispatch happens
+ * inside open/ioctl/release. */
+struct file_operations micscif_ops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = micscif_unlocked_ioctl,
+	.mmap = micscif_mmap,
+	.poll = micscif_poll,
+	.flush = micscif_flush,
+	.open = micscif_open,
+	.release = micscif_release,
+};
+
+/* devnode callback: place device nodes under /dev/mic/ (e.g. /dev/mic/scif). */
+static char * scif_devnode(struct device *dev, mode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "mic/%s", dev_name(dev));
+}
+
+// Setup the base information for the driver. No interface specific code.
+//
+// Creates the char device region (2 minors), the cdev, the "micscif"
+// class, the /dev/mic/mic and /dev/mic/scif nodes, the sysfs attribute
+// group and the driver workqueues. Returns 0 on success or a negative
+// errno; on failure everything already created is torn down via the
+// reverse-order labels at the bottom.
+static int micscif_setup_base(void)
+{
+	long int result;
+	struct device *mdev;
+
+	if (micscif_major) {
+		micinfo.m_dev = MKDEV(micscif_major, micscif_minor);
+		result = register_chrdev_region(micinfo.m_dev, 2, "micscif");
+	} else {
+		result = alloc_chrdev_region(&micinfo.m_dev, micscif_minor, 2, "micscif");
+		micscif_major = MAJOR(micinfo.m_dev);
+	}
+	if (result < 0)
+		/* BUGFIX: nothing was registered, so there is nothing to
+		 * undo. The old code jumped to unreg_chrdev here and
+		 * unregistered a char region it never obtained. */
+		return result;
+
+	cdev_init(&(micinfo.m_cdev), &micscif_ops);
+	micinfo.m_cdev.owner = THIS_MODULE;
+	if ((result = cdev_add(&(micinfo.m_cdev), micinfo.m_dev, 2)))
+		goto unreg_chrdev;
+
+	micinfo.m_class = class_create(THIS_MODULE, "micscif");
+	if (IS_ERR(micinfo.m_class)) {
+		result = PTR_ERR(micinfo.m_class);
+		goto del_m_dev;
+	}
+
+	micinfo.m_class->devnode = scif_devnode;
+
+	/* Create /dev/mic/mic (minor 0) and /dev/mic/scif (minor 1). */
+	mdev = device_create(micinfo.m_class, NULL, micinfo.m_dev, NULL, "mic");
+	if (IS_ERR(mdev)) {
+		result = PTR_ERR(mdev);
+		goto class_destroy;
+	}
+	micinfo.m_scifdev = device_create(micinfo.m_class, NULL,
+					  micinfo.m_dev + 1, NULL, "scif");
+	if (IS_ERR(micinfo.m_scifdev)) {
+		result = PTR_ERR(micinfo.m_scifdev);
+		goto device_destroy;
+	}
+	if ((result = sysfs_create_group(&micinfo.m_scifdev->kobj, &scif_attr_group)))
+		goto device_destroy1;
+
+	/* Global lock and list initialization for endpoint/RMA handling. */
+	spin_lock_init(&ms_info.mi_eplock);
+	spin_lock_init(&ms_info.mi_connlock);
+	spin_lock_init(&ms_info.mi_rmalock);
+	mutex_init(&ms_info.mi_fencelock);
+	spin_lock_init(&ms_info.mi_nb_connect_lock);
+	INIT_LIST_HEAD(&ms_info.mi_uaccept);
+	INIT_LIST_HEAD(&ms_info.mi_listen);
+	INIT_LIST_HEAD(&ms_info.mi_zombie);
+	INIT_LIST_HEAD(&ms_info.mi_connected);
+	INIT_LIST_HEAD(&ms_info.mi_disconnected);
+	INIT_LIST_HEAD(&ms_info.mi_rma);
+	INIT_LIST_HEAD(&ms_info.mi_rma_tc);
+	INIT_LIST_HEAD(&ms_info.mi_nb_connect_list);
+
+#ifdef CONFIG_MMU_NOTIFIER
+	INIT_LIST_HEAD(&ms_info.mi_mmu_notif_cleanup);
+#endif
+	INIT_LIST_HEAD(&ms_info.mi_fence);
+	/* Singlethreaded workqueues for misc, connection and MMU work. */
+	if (!(ms_info.mi_misc_wq = create_singlethread_workqueue("SCIF_MISC"))) {
+		result = -ENOMEM;
+		goto remove_group;
+	}
+	INIT_WORK(&ms_info.mi_misc_work, micscif_misc_handler);
+	if (!(ms_info.mi_conn_wq = create_singlethread_workqueue("SCIF_NB_CONN"))) {
+		result = -ENOMEM;
+		goto destroy_misc_wq;
+	}
+	INIT_WORK(&ms_info.mi_conn_work, micscif_conn_handler);
+#ifdef CONFIG_MMU_NOTIFIER
+	if (!(ms_info.mi_mmu_notif_wq = create_singlethread_workqueue("SCIF_MMU"))) {
+		result = -ENOMEM;
+		goto destroy_conn_wq;
+	}
+	INIT_WORK(&ms_info.mi_mmu_notif_work, micscif_mmu_notif_handler);
+#endif
+	ms_info.mi_watchdog_to = DEFAULT_WATCHDOG_TO;
+#ifdef MIC_IS_EMULATION
+	ms_info.mi_watchdog_enabled = 0;
+#else
+	ms_info.mi_watchdog_enabled = 1;
+#endif
+	ms_info.mi_rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT;
+	ms_info.mi_proxy_dma_threshold = mic_p2p_proxy_thresh;
+	ms_info.en_msg_log = 0;
+	return result;
+
+	/* Error unwind, in reverse order of the setup above. */
+#ifdef CONFIG_MMU_NOTIFIER
+destroy_conn_wq:
+	destroy_workqueue(ms_info.mi_conn_wq);
+#endif
+destroy_misc_wq:
+	destroy_workqueue(ms_info.mi_misc_wq);
+remove_group:
+	sysfs_remove_group(&micinfo.m_scifdev->kobj, &scif_attr_group);
+device_destroy1:
+	device_destroy(micinfo.m_class, micinfo.m_dev + 1);
+device_destroy:
+	device_destroy(micinfo.m_class, micinfo.m_dev);
+class_destroy:
+	class_destroy(micinfo.m_class);
+del_m_dev:
+	cdev_del(&(micinfo.m_cdev));
+unreg_chrdev:
+	unregister_chrdev_region(micinfo.m_dev, 2);
+	return result;
+}
+
+#define SBOX_MMIO_LENGTH 0x10000
+
+/*
+ * Module init: validate the mandatory scif_id/scif_addr parameters, map
+ * the host SBOX MMIO, open the DMA device, set up the host queue pair
+ * and interrupts, create the char devices, then bring up the loopback
+ * queue pair. The error labels at the bottom fall through each other to
+ * undo the setup steps in reverse order.
+ */
+static int micscif_init(void)
+{
+	int result = 0;
+	int i;
+	phys_addr_t host_queue_phys;
+	phys_addr_t gtt_phys_base;
+
+	pr_debug("HELLO SCIF!\n");
+
+#if defined(CONFIG_ML1OM)
+	pr_debug("micscif_init(): Hello KNF!\n");
+#elif defined(CONFIG_MK1OM)
+	pr_debug("micscif_init(): Hello KNC!\n");
+#endif
+
+	/* Both module parameters are mandatory; refuse to load otherwise. */
+	if (!scif_id || !scif_addr) {
+		printk(KERN_ERR "%s %d scif_id 0x%x scif_addr 0x%lx"
+			"not provided as module parameter. Fail module load",
+			__func__, __LINE__, scif_id, scif_addr);
+		return -EINVAL;
+	}
+
+	/* Basic initialization for every possible remote node slot. */
+	for (i = 1; i <= MAX_BOARD_SUPPORTED; i++) {
+		scif_dev[i].sd_state = SCIFDEV_INIT;
+		scif_dev[i].sd_node = i;
+		scif_dev[i].sd_numa_node = -1;
+		mutex_init (&scif_dev[i].sd_lock);
+		init_waitqueue_head(&scif_dev[i].sd_mmap_wq);
+		init_waitqueue_head(&scif_dev[i].sd_wq);
+		init_waitqueue_head(&scif_dev[i].sd_p2p_wq);
+		INIT_DELAYED_WORK(&scif_dev[i].sd_p2p_dwork,
+			scif_poll_qp_state);
+		scif_dev[i].sd_p2p_retry = 0;
+	}
+
+	// Setup the host node access information
+	// Initially only talks to the host => node 0
+	scif_dev[SCIF_HOST_NODE].sd_node = SCIF_HOST_NODE;
+	scif_dev[SCIF_HOST_NODE].sd_state = SCIFDEV_RUNNING;
+	if (!(scif_dev[SCIF_HOST_NODE].mm_sbox =
+		ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH))) {
+		result = -ENOMEM;
+		goto error;
+	}
+	scif_dev[SCIF_HOST_NODE].scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+	scif_dev[SCIF_HOST_NODE].scif_map_ref_cnt = 0;
+	init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_wq);
+	init_waitqueue_head(&scif_dev[SCIF_HOST_NODE].sd_mmap_wq);
+	mutex_init(&scif_dev[SCIF_HOST_NODE].sd_lock);
+	/* SBOX stores the GTT base in 4KB units; scale to bytes for the
+	 * debug prints below. */
+	gtt_phys_base = readl(scif_dev[SCIF_HOST_NODE].mm_sbox + SBOX_GTT_PHY_BASE);
+	gtt_phys_base *= ((4) * 1024);
+	pr_debug("GTT PHY BASE in GDDR 0x%llx\n", gtt_phys_base);
+	pr_debug("micscif_init(): gtt_phy_base x%llx\n", gtt_phys_base);
+
+	/* Get handle to DMA device */
+	if ((result = open_dma_device(0, 0, &mic_dma_handle)))
+		goto unmap_sbox;
+
+	ms_info.mi_nodeid = scif_id;
+	ms_info.mi_maxid = scif_id;
+	ms_info.mi_total = 2;	// Host plus this card
+
+#ifdef RMA_DEBUG
+	ms_info.rma_unaligned_cpu_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+	ms_info.rma_alloc_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+	ms_info.rma_pin_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+#ifdef CONFIG_MMU_NOTIFIER
+	ms_info.mmu_notif_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+#endif
+#endif
+
+	pr_debug("micscif_init(): setup_card_qp \n");
+	host_queue_phys = scif_addr;
+	mutex_init(&ms_info.mi_event_cblock);
+	mutex_init(&ms_info.mi_conflock);
+	INIT_LIST_HEAD(&ms_info.mi_event_cb);
+
+	pr_debug("micscif_init(): setup_interrupts \n");
+	/*
+	 * Set up the workqueue thread for interrupt handling
+	 */
+	if ((result = micscif_setup_interrupts(&scif_dev[SCIF_HOST_NODE])))
+		goto close_dma;
+
+	pr_debug("micscif_init(): host_intr_handler \n");
+	/* NOTE(review): both branches end up destroying the intr workqueue
+	 * because uninit_qp falls through into destroy_intr_wq below; the
+	 * -ENXIO special case additionally runs micscif_uninit_qp(). */
+	if ((result = micscif_setup_card_qp(host_queue_phys, &scif_dev[SCIF_HOST_NODE]))) {
+		if (result == -ENXIO)
+			goto uninit_qp;
+		else
+			goto destroy_intr_wq;
+	}
+	/* need to do this last -- as soon as the dev is setup, userspace
+	 * can try to use the device
+	 */
+	pr_debug("micscif_init(): setup_base \n");
+	if ((result = micscif_setup_base()))
+		goto uninit_qp;
+	/*
+	 * Register the interrupt
+	 */
+	if ((result = register_scif_intr_handler(&scif_dev[SCIF_HOST_NODE])))
+		goto destroy_base;
+
+	// Setup information for self aka loopback.
+	scif_dev[ms_info.mi_nodeid].sd_node = ms_info.mi_nodeid;
+	scif_dev[ms_info.mi_nodeid].sd_numa_node = mic_host_numa_node;
+	scif_dev[ms_info.mi_nodeid].mm_sbox = scif_dev[SCIF_HOST_NODE].mm_sbox;
+	scif_dev[ms_info.mi_nodeid].scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+	scif_dev[ms_info.mi_nodeid].scif_map_ref_cnt = 0;
+	init_waitqueue_head(&scif_dev[ms_info.mi_nodeid].sd_wq);
+	init_waitqueue_head(&scif_dev[ms_info.mi_nodeid].sd_mmap_wq);
+	mutex_init(&scif_dev[ms_info.mi_nodeid].sd_lock);
+	if ((result = micscif_setup_loopback_qp(&scif_dev[ms_info.mi_nodeid])))
+		goto dereg_intr_handle;
+	scif_dev[ms_info.mi_nodeid].sd_state = SCIFDEV_RUNNING;
+
+	unaligned_cache = micscif_kmem_cache_create();
+	if (!unaligned_cache) {
+		result = -ENOMEM;
+		goto destroy_loopb;
+	}
+	scif_proc_init();
+	mic_debug_init();
+
+	pr_debug("micscif_init(): Setup successful: 0x%llx \n", host_queue_phys);
+
+#ifdef CONFIG_MK1OM
+	micpm_device_register(&mic_deviceevent);
+#endif
+	if ((result = register_reboot_notifier(&micscif_reboot_notifier)))
+		goto cache_destroy;
+
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,34))
+	result = pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "micscif", mic_pm_qos_cpu_dma_lat);
+	if (result) {
+		printk("%s %d mic_pm_qos_cpu_dma_lat %d result %d\n",
+			__func__, __LINE__, mic_pm_qos_cpu_dma_lat, result);
+		result = 0;
+		/* Dont fail driver load due to PM QoS API. Fall through */
+	}
+#endif
+
+	return result;
+	/* Error unwind: labels fall through top to bottom, undoing the
+	 * setup steps in reverse order. */
+cache_destroy:
+#ifdef CONFIG_MK1OM
+	micpm_device_unregister(&mic_deviceevent);
+#endif
+	micscif_kmem_cache_destroy();
+destroy_loopb:
+	micscif_destroy_loopback_qp(&scif_dev[ms_info.mi_nodeid]);
+dereg_intr_handle:
+	deregister_scif_intr_handler(&scif_dev[SCIF_HOST_NODE]);
+destroy_base:
+	pr_debug("Unable to finish scif setup for some reason: %d\n", result);
+	micscif_destroy_base();
+uninit_qp:
+	micscif_uninit_qp(&scif_dev[SCIF_HOST_NODE]);
+destroy_intr_wq:
+	micscif_destroy_interrupts(&scif_dev[SCIF_HOST_NODE]);
+close_dma:
+	close_dma_device(0, &mic_dma_handle);
+unmap_sbox:
+	iounmap(scif_dev[SCIF_HOST_NODE].mm_sbox);
+error:
+	return result;
+}
+
+module_init(micscif_init);
+module_exit(micscif_exit);
+
+module_param_named(huge_page, mic_huge_page_enable, bool, 0600);
+MODULE_PARM_DESC(huge_page, "SCIF Huge Page Support");
+
+module_param_named(ulimit, mic_ulimit_check, bool, 0600);
+MODULE_PARM_DESC(ulimit, "SCIF ulimit check");
+
+module_param_named(reg_cache, mic_reg_cache_enable, bool, 0600);
+MODULE_PARM_DESC(reg_cache, "SCIF registration caching");
+module_param_named(p2p, mic_p2p_enable, bool, 0600);
+MODULE_PARM_DESC(p2p, "SCIF peer-to-peer");
+
+module_param_named(p2p_proxy, mic_p2p_proxy_enable, bool, 0600);
+MODULE_PARM_DESC(p2p_proxy, "SCIF peer-to-peer proxy DMA support");
+
+module_param_named(pm_qos_cpu_dma_lat, mic_pm_qos_cpu_dma_lat, int, 0600);
+MODULE_PARM_DESC(pm_qos_cpu_dma_lat, "PM QoS CPU DMA latency in usecs.");
+
+module_param_named(numa_node, mic_host_numa_node, int, 0600);
+MODULE_PARM_DESC(numa_node, "Host Numa node to which MIC is attached");
+
+module_param_named(p2p_proxy_thresh, mic_p2p_proxy_thresh, ulong, 0600);
+MODULE_PARM_DESC(numa_node, "Transfer size after which Proxy DMA helps DMA perf");
+
+MODULE_LICENSE("GPL");
+MODULE_INFO(build_number, BUILD_NUMBER);
+MODULE_INFO(build_bywhom, BUILD_BYWHOM);
+MODULE_INFO(build_ondate, BUILD_ONDATE);
+MODULE_INFO(build_scmver, BUILD_SCMVER);
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* SCIF Node Management */
+
+#include "mic/micscif.h"
+#ifndef _MIC_SCIF_
+#include "mic_common.h"
+
+#endif
+#include "mic/micscif_map.h"
+#include "mic/micscif_intr.h"
+#ifdef _MIC_SCIF_
+extern mic_dma_handle_t mic_dma_handle;
+#else
+extern bool mic_crash_dump_enabled;
+#endif
+
+
+/**
+ * micscif_create_node_dep:
+ *
+ * @dev: Remote SCIF device.
+ * @nr_pages: number of pages
+ *
+ * Increment the map SCIF device ref count and notify the host if this is the
+ * first dependency being created between the two nodes.
+ */
+void
+micscif_create_node_dep(struct micscif_dev *dev, int nr_pages)
+{
+#ifdef SCIF_ENABLE_PM
+	struct nodemsg notif_msg;
+
+	if (dev) {
+		mutex_lock(&dev->sd_lock);
+		if (!dev->scif_map_ref_cnt) {
+			/* Notify Host if this is the first dependency being created */
+			notif_msg.uop = SCIF_NODE_CREATE_DEP;
+			notif_msg.src.node = ms_info.mi_nodeid;
+			notif_msg.payload[0] = dev->sd_node;
+			/* No error handling for Host SCIF device */
+			/* BUGFIX: restore "&notif_msg" -- the source had been
+			 * corrupted into the HTML entity for "&not" + "if_msg". */
+			micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], &notif_msg, NULL);
+		}
+		dev->scif_map_ref_cnt += nr_pages;
+		mutex_unlock(&dev->sd_lock);
+	}
+#endif
+}
+
+/**
+ * micscif_destroy_node_dep:
+ *
+ * @dev: Remote SCIF device.
+ * @nr_pages: number of pages
+ *
+ * Decrement the map SCIF device ref count and notify the host if a dependency
+ * no longer exists between two nodes.
+ */
+void
+micscif_destroy_node_dep(struct micscif_dev *dev, int nr_pages)
+{
+#ifdef SCIF_ENABLE_PM
+	struct nodemsg notif_msg;
+
+	if (dev) {
+		mutex_lock(&dev->sd_lock);
+		dev->scif_map_ref_cnt -= nr_pages;
+		if (!dev->scif_map_ref_cnt) {
+			/* Notify Host if all dependencies have been destroyed */
+			notif_msg.uop = SCIF_NODE_DESTROY_DEP;
+			notif_msg.src.node = ms_info.mi_nodeid;
+			notif_msg.payload[0] = dev->sd_node;
+			/* No error handling for Host SCIF device */
+			/* BUGFIX: restore "&notif_msg" -- the source had been
+			 * corrupted into the HTML entity for "&not" + "if_msg". */
+			micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], &notif_msg, NULL);
+		}
+		mutex_unlock(&dev->sd_lock);
+	}
+#endif
+}
+
+/**
+ * micscif_callback:
+ *
+ * @node: node id of the node added/removed.
+ * @event_type: SCIF_NODE_ADDED if a new node is added
+ * SCIF_NODE_REMOVED if a node is removed
+ *
+ * Invokes every registered event callback for a node add/remove event,
+ * holding mi_event_cblock across the whole walk.
+ */
+static void micscif_callback(uint16_t node, enum scif_event_type event_type)
+{
+	struct scif_callback *cb;
+	struct list_head *entry;
+	union eventd event;
+
+	/* Fill in the payload; unknown event types are silently ignored. */
+	if (event_type == SCIF_NODE_ADDED)
+		event.scif_node_added = node;
+	else if (event_type == SCIF_NODE_REMOVED)
+		event.scif_node_removed = node;
+	else
+		return;
+
+	mutex_lock(&ms_info.mi_event_cblock);
+	list_for_each(entry, &ms_info.mi_event_cb) {
+		cb = list_entry(entry, struct scif_callback, list_member);
+		cb->callback_handler(event_type, event);
+	}
+	mutex_unlock(&ms_info.mi_event_cblock);
+}
+
+/**
+ * micscif_node_remove_callback:
+ *
+ * @node: node id of the node removed.
+ *
+ * Thin wrapper: notify registered listeners that a node left the network.
+ */
+static void micscif_node_remove_callback(int node)
+{
+	micscif_callback((uint16_t)node, SCIF_NODE_REMOVED);
+}
+
+/**
+ * micscif_node_add_callback:
+ *
+ * @node: node id of the node added.
+ *
+ * Thin wrapper: notify registered listeners that a node joined the network.
+ */
+void micscif_node_add_callback(int node)
+{
+	micscif_callback((uint16_t)node, SCIF_NODE_ADDED);
+}
+
+/*
+ * micscif_cleanup_qp - unmap and reset queue pair 0 of a SCIF device.
+ *
+ * BUGFIX: test dev->qpairs itself before forming &dev->qpairs[0];
+ * the old code null-checked the result of &NULL[0], which is undefined
+ * behavior even though it usually evaluates to NULL. Also clears
+ * dev->qpairs after freeing it so a repeated cleanup cannot double-free.
+ */
+void micscif_cleanup_qp(struct micscif_dev *dev)
+{
+	struct micscif_qp *qp;
+
+	if (!dev->qpairs)
+		return;
+
+	qp = &dev->qpairs[0];
+
+	scif_iounmap((void*)qp->remote_qp, sizeof(struct micscif_qp), dev);
+	scif_iounmap((void*)dev->qpairs[0].outbound_q.rb_base, sizeof(struct micscif_qp), dev);
+	qp->remote_qp = NULL;
+	dev->qpairs[0].local_write = 0;
+	dev->qpairs[0].inbound_q.current_write_offset = 0;
+	dev->qpairs[0].inbound_q.current_read_offset = 0;
+#ifdef _MIC_SCIF_
+	/* On the card side the inbound ring buffer and the qpair array are
+	 * locally allocated, so free them here. */
+	kfree((void*)(qp->inbound_q.rb_base));
+	kfree(dev->qpairs);
+	dev->qpairs = NULL;
+	qp = NULL;
+#endif
+}
+
+/*
+ * micscif_cleanup_scifdev
+ *
+ * @dev: Remote SCIF device.
+ * @destroy_wq: also destroy the device's interrupt workqueue (card side).
+ *
+ * Uninitialize SCIF data structures for remote SCIF device: wake any
+ * waiters, tear down the interrupt path, spin until the device's
+ * activity ref count drops to zero, then invalidate endpoints, kill
+ * mmap users and clean up the queue pair.
+ */
+void micscif_cleanup_scifdev(struct micscif_dev *dev, bool destroy_wq)
+{
+	int64_t ret;
+#ifndef _MIC_SCIF_
+	mic_ctx_t *mic_ctx;
+#endif
+	if (SCIFDEV_NOTPRESENT == dev->sd_state) {
+#ifdef _MIC_SCIF_
+		/*
+		 * If there are any stale qp allocated due to
+		 * p2p connection failures then cleanup now
+		 */
+		micscif_cleanup_qp(dev);
+#endif
+		return;
+	}
+
+	/* Fail any API caller currently blocked on this device. */
+	dev->sd_wait_status = OP_FAILED;
+	wake_up(&dev->sd_wq);
+
+#ifdef _MIC_SCIF_
+	/*
+	 * Need to protect destruction of the workqueue since this code
+	 * can be called from two contexts:
+	 * a) Remove Node Handling.
+	 * b) SCIF driver unload
+	 */
+	mutex_lock(&dev->sd_lock);
+	if ((SCIFDEV_RUNNING != dev->sd_state) && (SCIFDEV_SLEEPING != dev->sd_state))
+		goto unlock;
+	dev->sd_state = SCIFDEV_STOPPED;
+	wake_up(&dev->sd_p2p_wq);
+	/* Drop the lock before deregistering: free_irq() may sleep. */
+	mutex_unlock(&dev->sd_lock);
+	deregister_scif_intr_handler(dev);
+	if (destroy_wq && dev->sd_intr_wq) {
+		destroy_workqueue(dev->sd_intr_wq);
+		dev->sd_intr_wq = NULL;
+	}
+#endif
+
+	mutex_lock(&dev->sd_lock);
+#ifndef _MIC_SCIF_
+	if ((SCIFDEV_RUNNING != dev->sd_state) && (SCIFDEV_SLEEPING != dev->sd_state))
+		goto unlock;
+	dev->sd_state = SCIFDEV_STOPPED;
+#endif
+	/*
+	 * Change the state of the remote SCIF device
+	 * to idle as soon as the activity counter is
+	 * zero. The node state and ref count is
+	 * maintained within a single atomic_long_t.
+	 * No timeout for this tight loop since we expect
+	 * the node to complete the API it is currently
+	 * executing following which the scif_ref_count
+	 * will drop to zero.
+	 */
+	do {
+		ret = atomic_long_cmpxchg(
+			&dev->scif_ref_cnt, 0, SCIF_NODE_IDLE);
+		cpu_relax();
+	} while (ret && ret != SCIF_NODE_IDLE);
+
+	mutex_unlock(&dev->sd_lock);
+	/* Cleanup temporary registered windows */
+	flush_workqueue(ms_info.mi_misc_wq);
+	mutex_lock(&dev->sd_lock);
+
+#ifdef _MIC_SCIF_
+	drain_dma_global(mic_dma_handle);
+#else
+	mic_ctx = get_per_dev_ctx(dev->sd_node - 1);
+	drain_dma_global(mic_ctx->dma_handle);
+	micscif_destroy_p2p(mic_ctx);
+#endif
+	scif_invalidate_ep(dev->sd_node);
+	micscif_kill_apps_with_mmaps(dev->sd_node);
+
+	micscif_cleanup_qp(dev);
+	mutex_unlock(&dev->sd_lock);
+#ifndef _MIC_SCIF_
+	/* Host side: drop this node from the connectivity mask and count. */
+	mutex_lock(&ms_info.mi_conflock);
+	ms_info.mi_mask &= ~(0x1 << dev->sd_node);
+	ms_info.mi_total--;
+	mutex_unlock(&ms_info.mi_conflock);
+#endif
+
+	/* Wait for all applications to unmap remote memory mappings. */
+	wait_event(dev->sd_mmap_wq,
+		!micscif_rma_do_apps_have_mmaps(dev->sd_node));
+	micscif_cleanup_rma_for_zombies(dev->sd_node);
+	micscif_node_remove_callback(dev->sd_node);
+	return;
+unlock:
+	mutex_unlock(&dev->sd_lock);
+}
+
/*
 * micscif_handle_remove_node:
 *
 * @mask: bitmask of nodes in the deactivation set (for a lost node,
 * the node id itself rather than a mask).
 * @payload: disconnect type (Power Management, Lost Node etc.) in the
 * low 32 bits; for PM, the nodemask size in the high 32 bits.
 *
 * Attempt to deactivate a set of remote SCIF devices nodes passed in mask.
 * If the SCIF activity ref count is positive for a remote node then
 * the appropriate bit in the input bitmask is reset and the resultant
 * bitmask is returned.
 */
uint64_t micscif_handle_remove_node(uint64_t mask, uint64_t payload)
{
	int64_t ret;
	int err = 0;
	uint32_t i;
	struct micscif_dev *dev;
	uint64_t flags = 0;
	/* Low 32 bits of the payload carry the disconnect type. */
	flags = payload & 0x00000000FFFFFFFF;

	switch(flags) {
	case DISCONN_TYPE_POWER_MGMT:
	{
		uint8_t *nodemask_buf = NULL;
		/* High 32 bits of the payload carry the nodemask size in bytes. */
		int size = payload >> 32;

#ifndef _MIC_SCIF_
		/* Host side: the mask lives in the shared PM context. */
		nodemask_buf = mic_data.dd_pm.nodemask;
#else
		/* Card side: 'mask' is the address of the mask; map it in. */
		nodemask_buf = scif_ioremap(mask, size, &scif_dev[SCIF_HOST_NODE]);
#endif
		if (!nodemask_buf) {
			/* NOTE(review): err is a positive EAGAIN here (and
			 * below), unlike the negative-errno style used
			 * elsewhere in this file — confirm callers expect
			 * a positive value. */
			err = EAGAIN;
			break;
		}

		for (i = 0; i <= ms_info.mi_maxid; i++) {
			dev = &scif_dev[i];
			/* Only nodes named in the deactivation set are touched. */
			if (!get_nodemask_bit(nodemask_buf , i))
				continue;
			/*
			 * Try for the SCIF device lock. Bail out if
			 * it is already grabbed since some other
			 * thread is already working on some other
			 * node state transition for this remote SCIF device.
			 */
			if (mutex_trylock(&dev->sd_lock)) {

				if (SCIFDEV_RUNNING != dev->sd_state) {
					mutex_unlock(&dev->sd_lock);
					continue;
				}
				/*
				 * Change the state of the remote SCIF device
				 * to idle only if the activity counter is
				 * already zero. The node state and ref count
				 * is maintained within a single atomic_long_t.
				 */
				ret = atomic_long_cmpxchg(
					&dev->scif_ref_cnt, 0, SCIF_NODE_IDLE);

				if (!ret || ret == SCIF_NODE_IDLE) {
					/* Ref count was zero: quiesce DMA before sleeping. */
					if (!ret) {
#ifdef _MIC_SCIF_
						drain_dma_global(mic_dma_handle);
#else
						mic_ctx_t *mic_ctx = get_per_dev_ctx(dev->sd_node - 1);
						drain_dma_global(mic_ctx->dma_handle);
#endif
					}
					/*
					 * Turn off the remote SCIF device.
					 * Any communication to this SCIF
					 * after this point will require a
					 * wake up message to the host.
					 */
					dev->sd_state = SCIFDEV_SLEEPING;
					err = 0;
				}
				else {
					/*
					 * Cannot put the remote SCIF device
					 * to sleep.
					 */
					err = EAGAIN;
					mutex_unlock(&dev->sd_lock);
					break;
				}
				mutex_unlock(&dev->sd_lock);
			} else {
				err = EAGAIN;
				break;
			}
		}

#ifndef _MIC_SCIF_
		/* NOTE(review): on the host, nodemask_buf was NOT ioremapped
		 * above (the card path was) — verify that scif_iounmap is a
		 * no-op for unmapped addresses, or that this #ifndef should
		 * really be #ifdef. */
		scif_iounmap(nodemask_buf, size, &scif_dev[SCIF_HOST_NODE]);
#endif

		break;
	}
	case DISCONN_TYPE_LOST_NODE:
	{
		/* In the case of lost node, first paramater
		 * is the node id and not a mask.
		 */
		dev = &scif_dev[mask];
		micscif_cleanup_scifdev(dev, !DESTROY_WQ);
		break;
	}
	default:
	{
		/* Unknown remove node flags */
		BUG_ON(1);
	}
	}

	return err;
}
+
+/**
+ * set_nodemask_bit:
+ *
+ * @node_id[in]: node id to be set in the mask
+ *
+ * Set bit in the nodemask. each bit represents node. set bit to add node in to
+ * activation/de-activation set
+ */
+//void
+//set_nodemask_bit(uint64_t *nodemask, uint32_t node_id)
/*
 * Set (val != 0) or clear (val == 0) the bit for node_id in the
 * byte-array nodemask; bit k of byte n represents node n*8 + k.
 */
void
set_nodemask_bit(uint8_t* nodemask, uint32_t node_id, int val)
{
	uint8_t *byte = nodemask + node_id / 8;
	uint8_t bit = (uint8_t)(1U << (node_id % 8));

	if (val)
		*byte |= bit;
	else
		*byte &= ~bit;
}
+
+/**
 * get_nodemask_bit:
+ *
+ * @node_id[in]: node id to be set in the mask
+ *
+ * Check if a bit in the nodemask corresponding to a
+ * node id is set.
+ *
+ * return 1 if the bit is set. 0 if the bit is cleared.
+ */
/*
 * Test the bit for node_id in the byte-array nodemask.
 * Returns non-zero (the masked byte value) if set, 0 if clear.
 */
int
get_nodemask_bit(uint8_t* nodemask, uint32_t node_id) {
	uint32_t byte_index = node_id / 8;
	uint32_t bit_index = node_id % 8;

	return nodemask[byte_index] & (1U << bit_index);
}
+/**
+* nodemask_isvalid - Check if a nodemask is valid after
+* calculating the de-activation set.
+*
+* @nodemask[in]: The nodemask to be checked.
+*
+* Returns true if valid.
+*/
+bool nodemask_isvalid(uint8_t* nodemask) {
+ uint32_t i;
+ for (i = 0; i <= ms_info.mi_maxid; i++) {
+ if (get_nodemask_bit(nodemask, i))
+ return true;
+ }
+
+ return false;
+}
+
+#ifndef _MIC_SCIF_
+/*
+ * micscif_send_rmnode_msg:
+ *
+ * @mask: Bitmask of nodes in the deactivation set.
+ * @node: Destination node for a deactivation set.
+ * @flags: Type of deactivation set i.e. Power Management,
+ * RAS, Maintenance Mode etc.
+ * @orig_node: The node which triggered this remove node message.
+ *
+ * Sends a deactivation request to the valid nodes not included in the
+ * deactivation set from the Host and waits for a response.
+ * Returns the response mask received from the node.
+ */
+uint64_t micscif_send_pm_rmnode_msg(int node, uint64_t nodemask_addr,
+ uint64_t nodemask_size, int orig_node) {
+
+ uint64_t ret;
+ struct nodemsg notif_msg;
+ struct micscif_dev *dev = &scif_dev[node];
+
+ /*
+ * Send remove node msg only to running nodes.
+ * An idle node need not know about another _lost_ node
+ * until it wakes up. When it does, it will request the
+ * host to wake up the _lost_ node to which the host will
+ * respond with a NACK
+ */
+
+ if (SCIFDEV_RUNNING != dev->sd_state)
+ return -ENODEV;
+
+ notif_msg.uop = SCIF_NODE_REMOVE;
+ notif_msg.src.node = ms_info.mi_nodeid;
+ notif_msg.dst.node = node;
+ notif_msg.payload[0] = nodemask_addr;
+ notif_msg.payload[1] = DISCONN_TYPE_POWER_MGMT;
+ notif_msg.payload[1] |= (nodemask_size << 32);
+ notif_msg.payload[2] = atomic_long_read(&ms_info.mi_unique_msgid);
+ notif_msg.payload[3] = orig_node;
+ /* Send the request to remove a set of nodes */
+ pr_debug("Send PM rmnode msg for node %d to node %d\n", orig_node, node);
+ ret = micscif_nodeqp_send(dev, ¬if_msg, NULL);
+
+ return ret;
+}
+
/* Send a SCIF_NODE_REMOVE (lost-node flavor) message to 'node',
 * naming 'orig_node' as the node that was lost. Returns the send
 * status, or -ENODEV if the destination is not running. */
uint64_t micscif_send_lost_node_rmnode_msg(int node, int orig_node) {
	uint64_t ret;
	struct nodemsg notif_msg;
	struct micscif_dev *dev = &scif_dev[node];

	/*
	 * Send remove node msg only to running nodes.
	 * An idle node need not know about another _lost_ node
	 * until it wakes up. When it does, it will request the
	 * host to wake up the _lost_ node to which the host will
	 * respond with a NACK
	 */
	if (SCIFDEV_RUNNING != dev->sd_state)
		return -ENODEV;

	/* Hold a node reference so the peer cannot idle mid-send. */
	micscif_inc_node_refcnt(dev, 1);
	notif_msg.uop = SCIF_NODE_REMOVE;
	notif_msg.src.node = ms_info.mi_nodeid;
	notif_msg.dst.node = node;
	/* payload[0]: the lost node's id (not a mask, unlike the PM path). */
	notif_msg.payload[0] = orig_node;
	notif_msg.payload[1] = DISCONN_TYPE_LOST_NODE;
	/* NOTE(review): payload[2] is left uninitialized here, whereas the
	 * PM variant fills it with ms_info.mi_unique_msgid — confirm the
	 * lost-node receiver ignores payload[2]. */
	notif_msg.payload[3] = orig_node;
	/* Send the request to remove a set of nodes */
	ret = micscif_nodeqp_send(dev, &notif_msg, NULL);
	micscif_dec_node_refcnt(dev, 1);

	return ret;
}
+
+/*
+ * micpm_nodemask_uninit:
+ * @node - node to uninitalize
+ *
+ * Deallocate memory for per-card nodemask buffer
+*/
+void
+micpm_nodemask_uninit(mic_ctx_t* mic_ctx)
+{
+ if (mic_ctx && mic_ctx->micpm_ctx.nodemask.va) {
+ mic_ctx_unmap_single(mic_ctx, mic_ctx->micpm_ctx.nodemask.pa,
+ mic_ctx->micpm_ctx.nodemask.len);
+ kfree(mic_ctx->micpm_ctx.nodemask.va);
+ }
+}
+
+/*
+ * micpm_nodemask_init:
+ * @num_devs - no of scif nodes including the host
+ * @node - node to initialize
+ *
+ * Allocate memory for per-card nodemask buffer
+*/
+int
+micpm_nodemask_init(uint32_t num_devs, mic_ctx_t* mic_ctx)
+{
+ if (!mic_ctx)
+ return 0;
+
+ mic_ctx->micpm_ctx.nodemask.len = ((int) (num_devs / 8) +
+ ((num_devs % 8) ? 1 : 0));
+ mic_ctx->micpm_ctx.nodemask.va = (uint8_t *)
+ kzalloc(mic_ctx->micpm_ctx.nodemask.len, GFP_KERNEL);
+
+ if (!mic_ctx->micpm_ctx.nodemask.va) {
+ PM_DEBUG("Error allocating nodemask buffer\n");
+ return -ENOMEM;
+ }
+
+ mic_ctx->micpm_ctx.nodemask.pa = mic_ctx_map_single(mic_ctx,
+ mic_ctx->micpm_ctx.nodemask.va,
+ mic_ctx->micpm_ctx.nodemask.len);
+
+ if(mic_map_error(mic_ctx->micpm_ctx.nodemask.pa)) {
+ PM_PRINT("Error Mapping nodemask buffer\n");
+ kfree(mic_ctx->micpm_ctx.nodemask.va);
+ }
+ return 0;
+}
+
+/**
+ * micpm_disconn_uninit:
+ * @num_devs - no of scif nodes including host
+ * Note - can not use ms_info.mi_total(total no of scif nodes) as it is updated after the driver load is complete
+ *
+ * Reset/re-initialize data structures needed for PM disconnection. This is necessary everytime the board is reset.
+ * Since host(node 0)represents one of the node in network, it is necessary to clear dependency of host with the given node
+ */
+int
+micpm_disconn_uninit(uint32_t num_devs)
+{
+ uint32_t i;
+ uint32_t status = 0;
+
+ /*
+ * ms_info.mi_total is updated after the driver load is complete
+ * switching back to static allocation of max nodes
+ */
+
+ if (ms_info.mi_depmtrx) {
+
+ for (i = 0; i < (int)num_devs; i++) {
+ if (ms_info.mi_depmtrx[i]) {
+ kfree(ms_info.mi_depmtrx[i]);
+ }
+ }
+ kfree(ms_info.mi_depmtrx);
+ }
+
+ if (mic_data.dd_pm.nodemask)
+ kfree(mic_data.dd_pm.nodemask);
+
+ return status;
+}
+
+/**
+ * micpm_disconn_init:
+ * @num_devs - no of scif nodes including host
+ * Note - can not use ms_info.mi_total(total no of scif nodes) as it is updated after the driver load is complete
+ *
+ * Allocate memory for dependency graph. Initialize dependencies for the node.
+ * The memory allocated is based on the no of devices present during driver load.
+ */
int
micpm_disconn_init(uint32_t num_devs)
{
	uint32_t i;
	uint32_t status = 0;
	mic_ctx_t *mic_ctx;

	/* Already initialized (e.g. called again after a board reset). */
	if (ms_info.mi_depmtrx)
		return status;

	/* num_devs x num_devs dependency matrix, zero-initialized
	 * (DEP_STATE zero value for every pair). */
	ms_info.mi_depmtrx = (uint32_t**)kzalloc(sizeof(uint32_t*) * num_devs, GFP_KERNEL);
	if (!ms_info.mi_depmtrx) {
		pr_debug("dependency graph initialization failed\n");
		status = -ENOMEM;
		goto exit;
	}

	for (i = 0; i < (int)num_devs; i++) {
		ms_info.mi_depmtrx[i] = (uint32_t*)kzalloc(sizeof(uint32_t) * num_devs, GFP_KERNEL);
		if (!ms_info.mi_depmtrx[i]) {
			/* Frees the rows allocated so far plus the row array. */
			micpm_disconn_uninit(num_devs);
			pr_debug("dependency graph initialization failed\n");
			status = -ENOMEM;
			goto exit;
		}
	}
	init_waitqueue_head(&ms_info.mi_disconn_wq);
	atomic_long_set(&ms_info.mi_unique_msgid, 0);

	//In Windows, this code is executed during micpm_probe
	/* Per-card nodemask buffers; num_devs includes the host, hence -1.
	 * NOTE(review): on failure here the matrix and earlier per-card
	 * buffers are left allocated — confirm callers invoke
	 * micpm_disconn_uninit()/micpm_nodemask_uninit() on error. */
	for(i = 0; i < (num_devs - 1); i++) {
		mic_ctx = get_per_dev_ctx(i);
		status = micpm_nodemask_init(num_devs, mic_ctx);
		if (status)
			goto exit;
	}

	/* Set up a nodemask buffer for Host scif node in a common pm_ctx */
	mic_data.dd_pm.nodemask_len = ((int) (num_devs / 8) +
		((num_devs % 8) ? 1 : 0));
	mic_data.dd_pm.nodemask = (uint8_t *)
		kzalloc(mic_data.dd_pm.nodemask_len, GFP_KERNEL);

	if (!mic_data.dd_pm.nodemask) {
		PM_DEBUG("Error allocating nodemask buffer\n");
		status = -ENOMEM;
		goto exit;
	}

exit:
	return status;
}
+
+/**
+ * micscif_set_nodedep:
+ *
+ * @src_node: node which is creating dependency.
+ * @dst_node: node on which dependency is being created
+ *
+ * sets the given value in dependency graph for src_node -> dst_node
+ */
+void
+micscif_set_nodedep(uint32_t src_node, uint32_t dst_node, enum dependency_state state)
+{
+ /* We dont need to lock dependency graph while updating
+ * as every node will modify its own row
+ */
+ if (ms_info.mi_depmtrx)
+ ms_info.mi_depmtrx[src_node][dst_node] = state;
+}
+
+/**
+ * micscif_get_nodedep:
+ *
+ * @src_node: node which has/has not created dependency.
+ * @dst_node: node on which dependency was/was not created
+ *
+ * gets the current value in dependency graph for src_node -> dst_node
+ */
+enum dependency_state
+micscif_get_nodedep(uint32_t src_node, uint32_t dst_node)
+{
+ enum dependency_state state = DEP_STATE_NOT_DEPENDENT;
+ if (ms_info.mi_depmtrx)
+ state = ms_info.mi_depmtrx[src_node][dst_node];
+ return state;
+}
+
+/**
+ * init_depgraph_stack:
+ *
+ * @stack_ptr: list head.
+ *
+ * Initialize linked list to be used as stack
+ */
+int
+init_depgraph_stack(struct list_head *stack_ptr)
+{
+ int status = 0;
+
+ if (!stack_ptr) {
+ pr_debug("%s argument stack_ptr is invalid\n", __func__);
+ status = -EINVAL;
+ goto exit;
+ }
+ /* Initialize stack */
+ INIT_LIST_HEAD(stack_ptr);
+
+exit:
+ return status;
+}
+
+/**
+ * uninit_depgraph_stack:
+ *
+ * @stack_ptr: list head for linked list(stack).
+ *
+ * Empty stack(linked list). Pop all the nodes left in the stack.
+ */
+int
+uninit_depgraph_stack(struct list_head *stack_ptr)
+{
+ int status = 0;
+ uint32_t node_id;
+ if (!stack_ptr) {
+ pr_debug("%s argument stack_ptr is invalid\n", __func__);
+ status = -EINVAL;
+ goto exit;
+ }
+
+ /* pop all the nodes left in the stack */
+ while (!is_stack_empty(stack_ptr)) {
+ status = stack_pop_node(stack_ptr, &node_id);
+ if (status) {
+ pr_debug("%s error while cleaning up depgraph stack\n", __func__);
+ status = -EINVAL;
+ goto exit;
+ }
+ }
+
+exit:
+ return status;
+}
+
+/**
+ * is_stack_empty:
+ *
+ * @stack_ptr: list head for linked list(stack).
+ *
+ * returns true if the stack is empty.
+ */
/* Returns 1 if the stack (list) holds no entries, 0 otherwise. */
int
is_stack_empty(struct list_head *stack_ptr)
{
	return list_empty(stack_ptr) ? 1 : 0;
}
+
+/**
+ * stack_push_node:
+ *
+ * @stack_ptr[in]: list head for linked list(stack).
+ * @node_id[in]: node id to be pushed
+ *
+ * Push node in to the stack i.e. create node and add it at the start of linked list
+ */
+int
+stack_push_node(struct list_head *stack_ptr, uint32_t node_id)
+{
+ int status = 0;
+ struct stack_node *datanode = NULL;
+
+ datanode = kmalloc(sizeof(struct stack_node), GFP_KERNEL);
+ if (!datanode) {
+ pr_debug("%s error allocating memory to stack node.\n", __func__);
+ status = -ENOMEM;
+ goto exit;
+ }
+
+ datanode->node_id = node_id;
+ list_add(&datanode->next, stack_ptr);
+exit:
+ return status;
+}
+
+/**
+ * stack_pop_node:
+ *
+ * @stack_ptr[in]: list head for linked list(stack).
+ * @node_id[out]: pointer to the node id to be popped
+ *
+ * Pop node from the stack i.e. delete first entry of linked list and return its data.
+ */
+int
+stack_pop_node(struct list_head *stack_ptr, uint32_t *node_id)
+{
+ int status = 0;
+ struct stack_node *datanode = NULL;
+
+ if(is_stack_empty(stack_ptr)) {
+ pr_debug("%s stack found empty when tried to pop\n", __func__);
+ status = -EFAULT;
+ goto exit;
+ }
+
+ datanode = list_first_entry(stack_ptr, struct stack_node, next);
+ if (!datanode) {
+ pr_debug("%s Unable to pop from stack\n", __func__);
+ status = -EFAULT;
+ goto exit;
+ }
+ *node_id = datanode->node_id;
+
+ list_del(&datanode->next);
+ if (datanode) {
+ kfree(datanode);
+ }
+
+exit:
+ return status;
+}
+
+/**
+ * micscif_get_activeset:
+ *
+ * @node_id[in]: source node id.
+ * @nodemask[out]: bitmask of nodes present in activation set
+ *
+ * Algorithm to find out activation set for the given source node. Activation set is used to re-connect node into
+ * the scif network.
+ */
int
micscif_get_activeset(uint32_t node_id, uint8_t *nodemask)
{
	int status = 0;
	uint32_t i = 0;
	struct list_head stack;
	uint8_t visited[128] = {0}; // 128 is max number of nodes.
	uint32_t num_nodes = ms_info.mi_maxid + 1;
	mic_ctx_t *mic_ctx;

	/* NOTE(review): there is no check that num_nodes <= 128; this
	 * assumes the fabric never exceeds 128 nodes — confirm. */
	if (!ms_info.mi_depmtrx) {
		status = -EINVAL;
		goto exit;
	}

	status = init_depgraph_stack(&stack);
	if (status) {
		pr_debug("%s failed to initilize depgraph stack\n", __func__);
		goto exit;
	}

	/* DFS over the dependency matrix seeded with the node being woken. */
	status = stack_push_node(&stack, node_id);
	if (status) {
		pr_debug("%s error while running activation set algorithm\n", __func__);
		goto exit;
	}

	/* mark node visited to avoid repetition of the algorithm for the same node */
	visited[node_id] = 1;

	while (!is_stack_empty(&stack)) {
		status = stack_pop_node(&stack, &node_id);
		if (status) {
			pr_debug("%s error while running activation set algorithm\n", __func__);
			goto exit;
		}

		/* include node_id in the activation set*/
		set_nodemask_bit(nodemask, node_id, 1);

		for (i = 0; i < num_nodes; i++) {
			/* check if node has dependency on any node 'i' which is also disconnected at this time*/
			if ((!visited[i]) && (ms_info.mi_depmtrx[node_id][i] == DEP_STATE_DISCONNECTED)) {
				visited[i] = 1;
				/* Node 0 is the host; it has no per-device context. */
				if (i == 0)
					continue;
				mic_ctx = get_per_dev_ctx(i - 1);
				/* Only sleeping (PC3/PC6) cards need waking with us. */
				if ((mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC3) ||
					(mic_ctx->micpm_ctx.idle_state == PM_IDLE_STATE_PC6)) {
					status = stack_push_node(&stack, i);
					if (status) {
						pr_debug("%s error while running activation set algorithm\n", __func__);
						goto exit;
					}
				}
			}
		}
	} /* end of while (!is_stack_empty(&stack)) */
exit:
	/* Frees any entries left behind by an error-path exit. */
	uninit_depgraph_stack(&stack);
	return status;
}
+
+/**
+ * micscif_get_minimal_deactiveset:
+ *
+ * @node_id[in]: source node id.
+ * @nodemask[out]: bitmask of nodes present in de-activation set
+ * @visited[in/out]: information of which nodes are already visited in de-activation set algorithm
+ *
+ * Algorithm to find out minimum/must de-activation set for the given source node. This method is part of and used by
+ * micscif_get_deactiveset.
+ */
int micscif_get_minimal_deactiveset(uint32_t node_id, uint8_t *nodemask, uint8_t *visited)
{
	int status = 0;
	uint32_t i = 0;
	struct list_head stack;
	uint32_t num_nodes = ms_info.mi_maxid + 1;

	if (!ms_info.mi_depmtrx) {
		status = -EINVAL;
		goto exit;
	}

	status = init_depgraph_stack(&stack);
	/* 'visited' is validated only after the stack is initialized so the
	 * exit_pop path below always works on a valid (empty) list. */
	if (!visited) {
		pr_debug("%s invalid parameter visited", __func__);
		status = -EINVAL;
		goto exit_pop;
	}

	if (status) {
		pr_debug("%s failed to initilize depgraph stack\n", __func__);
		goto exit_pop;
	}

	status = stack_push_node(&stack, node_id);
	if (status) {
		pr_debug("%s error while running de-activation set algorithm\n", __func__);
		goto exit_pop;
	}

	/* mark node visited to avoid repetition of the algorithm for the same node */
	visited[node_id] = 1;

	/* DFS over reverse dependencies (who depends on node_id). */
	while (!is_stack_empty(&stack)) {

		status = stack_pop_node(&stack, &node_id);
		if (status) {
			pr_debug("%s error while running de-activation set algorithm\n", __func__);
			goto exit_pop;
		}

		/* include node_id in the activation set*/
		set_nodemask_bit(nodemask, node_id, 1);

		for (i = 0; i < num_nodes; i++) {
			if (!visited[i]) {
				if (ms_info.mi_depmtrx[i][node_id] == DEP_STATE_DEPENDENT) {
					/* The algorithm terminates, if we find any dependent node active */
					status = -EOPNOTSUPP;
					goto exit_pop;
				} else if(ms_info.mi_depmtrx[i][node_id] == DEP_STATE_DISCONNECT_READY) {
					/* node is dependent but ready to get disconnected */
					visited[i] = 1;
					status = stack_push_node(&stack, i);
					if (status) {
						pr_debug("%s error while running de-activation set algorithm\n", __func__);
						goto exit_pop;
					}
				}
			}
		}
	}/*end of while(!is_stack_empty(&stack))*/

exit_pop:
	/* Drain anything left on the stack and un-visit those nodes so the
	 * caller's 'visited' array stays consistent for a retry.
	 * NOTE(review): a successful pop here overwrites 'status' with 0,
	 * so an -EOPNOTSUPP reached with a non-empty stack is silently
	 * turned into success — confirm this is intended. */
	while (!is_stack_empty(&stack)) {
		status = stack_pop_node(&stack, &node_id);
		if (status) {
			pr_debug("%s error while running activation set algorithm\n", __func__);
			break;
		}
		if (visited)
			visited[node_id] = 0;
	}
exit:
	return status;
}
+
+/**
+ * micscif_get_deactiveset:
+ *
+ * @node_id[in]: source node id.
+ * @nodemask[out]: bitmask of nodes present in de-activation set
+ * @max_disconn: flag to restrict de-activation set algoritthm to minimum/must set.
+ * True value indicates maximum de-activation set
+ *
+ * Algorithm to find out de-activation set for the given source node. De-activation set is used to disconnect node into
+ * the scif network. The algorithm can find out maximum possible de-activation set(required in situations like
+ * power management)if the max_possible flag is set.
+ */
int
micscif_get_deactiveset(uint32_t node_id, uint8_t *nodemask, int max_disconn)
{
	int status = 0;
	uint32_t i = 0;
	struct list_head stack;
	uint8_t *visited = NULL;
	uint8_t cont_next_step = 0;	/* expand beyond the minimal set? */
	uint32_t num_nodes = ms_info.mi_maxid + 1;
	mic_ctx_t *mic_ctx;

	if (!ms_info.mi_depmtrx) {
		status = -EINVAL;
		goto exit;
	}

	status = init_depgraph_stack(&stack);
	if (status) {
		pr_debug("%s failed to initilize depgraph stack\n", __func__);
		goto exit;
	}

	/* Shared 'visited' array threaded through the minimal-set helper. */
	visited = kzalloc(sizeof(uint8_t) * num_nodes, GFP_KERNEL);
	if (!visited) {
		pr_debug("%s failed to allocated memory for visited array", __func__);
		status = -ENOMEM;
		goto exit;
	}

	status = stack_push_node(&stack, node_id);
	if (status) {
		pr_debug("%s error while running de-activation set algorithm\n", __func__);
		goto exit;
	}

	while (!is_stack_empty(&stack)) {

		status = stack_pop_node(&stack, &node_id);
		if (status) {
			pr_debug("%s error while running de-activation set algorithm\n", __func__);
			goto exit;
		}

		/* check if we want to find out maximum possible de-activation set */
		if (max_disconn) {
			cont_next_step = 1;
		}

		if (!visited[node_id]) {
			/* Grow the mask by this node's minimal (must-have) set. */
			status = micscif_get_minimal_deactiveset(node_id, nodemask, visited);
			if (status) {
				if (status == -EOPNOTSUPP) {
					/* This candidate cannot sleep; stop
					 * expanding through it but keep going. */
					pr_debug("%s No deactivation set found for node %d", __func__, node_id);
					cont_next_step = 0;
				}
				else {
					pr_debug("%s Failed to calculate deactivation set", __func__);
					goto exit;
				}
			}

		} /* end for if (!visited[node_id]) */

		if (cont_next_step) {
			for (i = 0; i < num_nodes; i++) {
				/* check if we can put more nodes 'i' in de-activation set if this node(dependent node)
				 * is de-activating
				 */
				if ((!visited[i]) &&
					(ms_info.mi_depmtrx[node_id][i] == DEP_STATE_DISCONNECT_READY)) {
					/* Node 0 is the host; no per-device context. */
					if (i == 0)
						continue;
					mic_ctx = get_per_dev_ctx(i - 1);
					if (mic_ctx->micpm_ctx.idle_state ==
						PM_IDLE_STATE_PC3_READY) {
						/* This node might be able to get into deactivation set */
						status = stack_push_node(&stack, i);
						if (status) {
							pr_debug("%s error while running de-activation set algorithm\n", __func__);
							goto exit;
						}
					}
				}
			}
		}
	} /* end for while (!is_stack_empty(&stack)) */

	/* An all-zero mask means nothing could be deactivated. */
	if (!nodemask_isvalid(nodemask)) {
		pr_debug("%s No deactivation set found for node %d",
			__func__, node_id);
		status = -EOPNOTSUPP;
	}
exit:
	if (visited) {
		kfree(visited);
	}
	uninit_depgraph_stack(&stack);
	return status;
}
+
+/* micscif_update_p2p_state:
+ *
+ * Update the p2p_disc_state of peer node peer_id in the p2p list of node node_id.
+ *
+ * @node_id: The node id whose p2p list needs to be updated.
+ * @peer_id: The node id in the p2p list of the node_id that will get updated.
+ * @scif_state: The state to be updated to.
+ *
+ */
+void micscif_update_p2p_state(uint32_t node_id, uint32_t peer_id, enum scif_state state) {
+
+ struct micscif_dev *dev;
+ struct list_head *pos, *tmp;
+ struct scif_p2p_info *p2p;
+
+ dev = &scif_dev[node_id];
+ if (!list_empty(&dev->sd_p2p)) {
+ list_for_each_safe(pos, tmp, &dev->sd_p2p) {
+ p2p = list_entry(pos, struct scif_p2p_info,
+ ppi_list);
+ if(p2p->ppi_peer_id == peer_id) {
+ p2p->ppi_disc_state = state;
+ break;
+ }
+ }
+ }
+}
+
/* micscif_rmnode_msg_sent: Check if a node exists in the
 * list of nodes that have been sent an rmnode message.
 *
 * node_list: The list that contains the nodes that have been
 * sent the rmnode message for this transaction.
 * node_id: the node to be searched for.
 *
 * returns: true if the node exists, false otherwise.
 */
+bool micscif_rmnode_msg_sent(struct list_head *node_list, uint32_t node_id) {
+
+ struct list_head *pos1, *tmp1;
+ struct stack_node *added_node;
+
+ if (!list_empty(node_list)) {
+ list_for_each_safe(pos1, tmp1, node_list) {
+ added_node = list_entry(pos1, struct stack_node, next);
+ if(added_node->node_id == node_id)
+ return true;
+ }
+ }
+ return false;
+}
+
/**
 * micscif_execute_disconnect: Perform PM disconnection of a node
 * with its neighboring nodes.
 *
 * node_id: The node to be disconnected.
 * nodemask: Mask containing the list of nodes (including node_id)
 * to be disconnected.
 * node_list: List of nodes that received the disconnection message.
 */
int micscif_execute_disconnect(uint32_t node_id,
		uint8_t *nodemask,
		struct list_head *node_list)
{
	uint32_t status = 0;
	int ret;
	uint64_t msg_cnt = 0;	/* rmnode messages successfully sent */
	uint32_t i = 0;
	int pending_wakeups = 0;
	mic_ctx_t *send_rmnode_ctx;
	uint32_t node;
	mic_ctx_t *mic_ctx = get_per_dev_ctx(node_id - 1);
	struct scif_p2p_info *p2p;
	struct list_head *pos, *tmp;
	struct micscif_dev *dev;


	/* Always send rmnode msg to SCIF_HOST_NODE */
	memcpy(mic_data.dd_pm.nodemask, nodemask,
		mic_data.dd_pm.nodemask_len);
	ret = (int) micscif_send_pm_rmnode_msg(SCIF_HOST_NODE, 0, mic_data.dd_pm.nodemask_len,
		node_id);
	/* Add this node to msg list. */
	if(!ret) {
		msg_cnt++;
		stack_push_node(node_list, SCIF_HOST_NODE);
	}

	/* -ENODEV (peer not running) does not fail the transaction. */
	if((ret == 0)||(ret == -ENODEV)) {
		status = 0;
	}

	/* For each node in the nodemask, traverse its p2p list
	 * and send rmnode_msg to those nodes 1) That are not also
	 * in the node mask and 2) That have not been already sent
	 * rmnode messages in this transaction and 3) That have
	 * their disconnection state as RUNNING.
	 */
	for (i = 0; i <= ms_info.mi_maxid; i++) {
		/* verify if the node is present in deactivation set */
		if (!get_nodemask_bit(nodemask, i))
			continue;

		/* Get to the p2p list of this node */
		dev = &scif_dev[i];
		list_for_each_safe(pos, tmp, &dev->sd_p2p) {
			p2p = list_entry(pos, struct scif_p2p_info,
				ppi_list);

			/* Skip peers that are themselves being deactivated. */
			if (get_nodemask_bit(nodemask, p2p->ppi_peer_id))
				continue;
			/* Skip peers already asleep w.r.t. this node. */
			if (p2p->ppi_disc_state == SCIFDEV_SLEEPING)
				continue;

			/* Skip peers already notified in this transaction. */
			if(micscif_rmnode_msg_sent(node_list, p2p->ppi_peer_id))
				continue;
			send_rmnode_ctx = get_per_dev_ctx(p2p->ppi_peer_id - 1);
			if (!send_rmnode_ctx->micpm_ctx.nodemask.va) {
				status = -EINVAL;
				goto list_cleanup;
			}

			/* Hand the peer a copy of the mask via its own
			 * DMA-mapped nodemask buffer. */
			memcpy(send_rmnode_ctx->micpm_ctx.nodemask.va, nodemask,
				send_rmnode_ctx->micpm_ctx.nodemask.len);
			ret = (int) micscif_send_pm_rmnode_msg(p2p->ppi_peer_id,
				send_rmnode_ctx->micpm_ctx.nodemask.pa,
				send_rmnode_ctx->micpm_ctx.nodemask.len,node_id);

			/* Add this node to msg list. */
			if(!ret) {
				msg_cnt++;
				stack_push_node(node_list, p2p->ppi_peer_id);
			}

			if((ret == 0)||(ret == -ENODEV)) {
				status = 0;
			}
		}
	}

	/* Wait until every recipient acked, or a wakeup started racing
	 * against this disconnect, or the timeout fires. */
	ret = wait_event_timeout(ms_info.mi_disconn_wq,
		(atomic_read(&mic_ctx->disconn_rescnt) == msg_cnt) ||
		(pending_wakeups = atomic_read(&mic_data.dd_pm.wakeup_in_progress)),
		NODE_ALIVE_TIMEOUT);
	if ((!ret) || (atomic_read(&mic_ctx->disconn_rescnt) != msg_cnt)
		|| (ms_info.mi_disconnect_status == OP_FAILED)) {
		pr_debug("SCIF disconnect failed. "
			"remove_node messages sent: = %llu "
			"remove_node acks received: %d "
			"Pending wakeups: %d ret = %d\n", msg_cnt,
			atomic_read(&mic_ctx->disconn_rescnt),
			pending_wakeups, ret);

		status = -EAGAIN;
		goto list_cleanup;
	}
	return status;

list_cleanup:
	/* On failure the caller gets an empty node_list back. */
	while (!is_stack_empty(node_list))
		stack_pop_node(node_list, &node);
	return status;
}
+
+/**
+ * micscif_node_disconnect:
+ *
+ * @node_id[in]: source node id.
+ * @nodemask[out]: bitmask of nodes that have to be disconnected together.
+ * it represents node_id
+ * @disconn_type[in]: flag to identify disconnection type. (for example - power mgmt, lost node, maintenance mode etc)
+ *
+ * Method responsible for disconnecting node from the scif network. considers dependencies with other node.
+ * finds out deactivation set. Sends node queue pair messages to all the scif nodes outside deactivation set
+ * returns error if node can not be disconnected from the network.
+ */
int micscif_disconnect_node(uint32_t node_id, uint8_t *nodemask, enum disconn_type type)
{
	uint32_t status = 0;
	int ret;
	uint64_t msg_cnt = 0;	/* lost-node messages successfully sent */
	uint32_t i = 0;
	mic_ctx_t *mic_ctx = 0;
	struct list_head node_list;
	uint32_t node;

	/* Node 0 is the host and cannot be disconnected. */
	if (!node_id)
		return -EINVAL;

	mic_ctx = get_per_dev_ctx(node_id - 1);

	if (!mic_ctx)
		return -EINVAL;

	switch(type) {
	case DISCONN_TYPE_POWER_MGMT:
	{
		if (!nodemask)
			return -EINVAL;

		/* New transaction id; acks are counted in disconn_rescnt. */
		atomic_long_add(1, &ms_info.mi_unique_msgid);
		atomic_set(&mic_ctx->disconn_rescnt, 0);
		ms_info.mi_disconnect_status = OP_IN_PROGRESS;
		INIT_LIST_HEAD(&node_list);

		status = micscif_execute_disconnect(node_id,
			nodemask, &node_list);
		if (status)
			return status;

		/* Reset unique msg_id */
		atomic_long_set(&ms_info.mi_unique_msgid, 0);

		/* Mark every notified node as SLEEPING in the p2p lists of
		 * every node in the deactivation set. */
		while (!is_stack_empty(&node_list)) {
			status = stack_pop_node(&node_list, &node);
			if (status)
				break;

			for (i = 0; i <= ms_info.mi_maxid; i++) {
				if (!get_nodemask_bit(nodemask, i))
					continue;
				micscif_update_p2p_state(i, node, SCIFDEV_SLEEPING);
			}
		}
		break;
	}
	case DISCONN_TYPE_LOST_NODE:
	{
		atomic_long_add(1, &ms_info.mi_unique_msgid);
		atomic_set(&mic_ctx->disconn_rescnt, 0);

		/* Tell every other node (the lost node excluded) to drop it. */
		for (i = 0; ((i <= ms_info.mi_maxid) && (i != node_id)); i++) {
			ret = (int)micscif_send_lost_node_rmnode_msg(i, node_id);
			if(!ret)
				msg_cnt++;
			/* -ENODEV (peer not running) is not a failure. */
			if((ret == 0)||(ret == -ENODEV)) {
				status = 0;
			}
		}

		/* Best-effort wait for acks; timeout is not treated as an
		 * error for the lost-node path. */
		ret = wait_event_timeout(ms_info.mi_disconn_wq,
			(atomic_read(&mic_ctx->disconn_rescnt) == msg_cnt),
			NODE_ALIVE_TIMEOUT);
		break;
	}
	default:
		status = -EINVAL;
	}

	return status;
}
+
+/**
+ * micscif_node_connect:
+ *
+ * @node_id[in]: node to wakeup.
+ * @bool get_ref[in]: Also get node reference after wakeup by incrementing the PM reference count
+ *
+ * Method responsible for connecting node into the scif network. considers dependencies with other node.
+ * finds out activation set. connects all the depenendent nodes in the activation set
+ * returns error if node can not be connected from the network.
+ */
+int
+micscif_connect_node(uint32_t node_id, bool get_ref)
+{
+ return do_idlestate_exit(get_per_dev_ctx(node_id - 1), get_ref);
+}
+
+uint64_t micscif_send_node_alive(int node)
+{
+ struct nodemsg alive_msg;
+ struct micscif_dev *dev = &scif_dev[node];
+ int err;
+
+ alive_msg.uop = SCIF_NODE_ALIVE;
+ alive_msg.src.node = ms_info.mi_nodeid;
+ alive_msg.dst.node = node;
+ pr_debug("node alive msg sent to node %d\n", node);
+ micscif_inc_node_refcnt(dev, 1);
+ err = micscif_nodeqp_send(dev, &alive_msg, NULL);
+ micscif_dec_node_refcnt(dev, 1);
+ return err;
+}
+
/* Recover a lost (crashed/hung) card: optionally capture a crash dump,
 * mark the card MIC_LOST, stop it, and reboot it if auto-reboot is on.
 * Returns 0 or the status of the last recovery step. */
int micscif_handle_lostnode(uint32_t node_id)
{
	mic_ctx_t *mic_ctx;
	uint32_t status = -EOPNOTSUPP;
#ifdef MM_HANDLER_ENABLE
	uint8_t *mmio_va;
	sbox_scratch1_reg_t scratch1reg = {0};
#endif

	printk("%s %d node %d\n", __func__, __LINE__, node_id);
	mic_ctx = get_per_dev_ctx(node_id - 1);

	/* Only act on cards that were up (or shutting down); any other
	 * state is already being handled elsewhere. */
	if (mic_ctx->state != MIC_ONLINE && mic_ctx->state != MIC_SHUTDOWN)
		return 0;

	if (mic_crash_dump_enabled) {
		if (!(status = vmcore_create(mic_ctx)))
			printk("%s %d node %d ready for crash dump!\n",
				__func__, __LINE__, node_id);
		else
			printk(KERN_ERR "%s %d node %d crash dump failed status %d\n",
				__func__, __LINE__, node_id, status);
	}

	mic_ctx->crash_count++;
	/* Re-check the state under the lock before declaring the card lost;
	 * it may have changed since the unlocked check above. */
	mutex_lock(&mic_ctx->state_lock);
	if (mic_ctx->state == MIC_ONLINE ||
		mic_ctx->state == MIC_SHUTDOWN)
		mic_setstate(mic_ctx, MIC_LOST);
	mutex_unlock(&mic_ctx->state_lock);

	/* mpssd will handle core dump and reset/auto reboot */
	if (mic_crash_dump_enabled && !status)
		return status;

	printk("%s %d stopping node %d to recover lost node!\n",
		__func__, __LINE__, node_id);
	status = adapter_stop_device(mic_ctx, 1, !RESET_REATTEMPT);
	wait_for_reset(mic_ctx);

	if (!ms_info.mi_watchdog_auto_reboot) {
		printk("%s %d cannot boot node %d to recover lost node since auto_reboot is off\n",
			__func__, __LINE__, node_id);
		return status;
	}

/* Disabling MM handler invocation till it is ready to handle errors
 * till then we just reboot the card
 */
#ifdef MM_HANDLER_ENABLE
	/* NOTE(review): this block references 'ret' and 'i', which are not
	 * declared in this function — it will not compile if
	 * MM_HANDLER_ENABLE is ever defined. */
	mmio_va = mic_ctx->mmio.va;
	scratch1reg.bits.status = FLASH_CMD_INVALID;

	if(mic_ctx->bi_family == FAMILY_ABR) {
		printk("Node %d lost. Cannot recover in KNF\n", node_id);
		status = adapter_start_device(mic_ctx);
		return status;
	}

	printk("Booting maintenance mode handler\n");
	status = set_card_usage_mode(mic_ctx, USAGE_MODE_MAINTENANCE, NULL, 0);
	if(status) {
		printk("Unable to boot maintenance mode\n");
		return status;
	}

	status = send_flash_cmd(mic_ctx, RAS_CMD, NULL, 0);
	if(status) {
		printk("Unable to recover node\n");
		return status;
	}
	/* Poll the SBOX scratch register until the flash command
	 * completes or NODE_ALIVE_TIMEOUT milliseconds elapse. */
	while(scratch1reg.bits.status != FLASH_CMD_COMPLETED) {
		ret = SBOX_READ(mmio_va, SBOX_SCRATCH1);
		scratch1reg.value = ret;
		msleep(1);
		i++;
		printk("Looping for status (time = %d ms)\n", i);
		if(i > NODE_ALIVE_TIMEOUT) {
			status = -ETIME;
			printk("Unable to recover node. Status bit is : %d\n",
				scratch1reg.bits.status);
			return status;
		}

	}
#endif
	printk("%s %d booting node %d to recover lost node!\n",
		__func__, __LINE__, node_id);
	status = adapter_start_device(mic_ctx);
	return status;
}
+
/*
 * micscif_watchdog_handler() - Periodic per-node watchdog work.
 * @work: embedded delayed work item inside struct micscif_dev.
 *
 * First reacts to the shutdown code the card last wrote (sdbic1):
 * halt/poweroff stops the card, restart marks it MIC_LOST, 0xdead
 * triggers lost-node/crash handling. Then, while the node is MIC_ONLINE
 * and the watchdog is enabled, sends a SCIF_NODE_ALIVE probe and waits
 * for the ack flag (sd_node_alive); a missing ack means the node is
 * treated as lost. Requeues itself every NODE_ALIVE_TIMEOUT while the
 * node is booting or online.
 */
void micscif_watchdog_handler(struct work_struct *work)
{
	struct micscif_dev *dev =
		container_of(to_delayed_work(work),
			struct micscif_dev, sd_watchdog_work);
	struct _mic_ctx_t *mic_ctx;
	int i = dev->sd_node, err, ret;

	mic_ctx = get_per_dev_ctx(i - 1);

	/* React to the reboot/shutdown notification from the card. */
	switch (mic_ctx->sdbic1) {
	case SYSTEM_HALT:
	case SYSTEM_POWER_OFF:
	{
		adapter_stop_device(mic_ctx, 1, !RESET_REATTEMPT);
		wait_for_reset(mic_ctx);
		mic_ctx->sdbic1 = 0;
		break;
	}
	case SYSTEM_RESTART:
	{
		/* Mark lost so user space can restart the node. */
		mic_setstate(mic_ctx, MIC_LOST);
		mic_ctx->sdbic1 = 0;
		break;
	}
	case SYSTEM_BOOTING:
	case SYSTEM_RUNNING:
#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0))
	case SYSTEM_SUSPEND_DISK:
#endif
		break;
	case 0xdead:
		/* Card signalled a crash; run lost-node handling (dump). */
		if (mic_crash_dump_enabled)
			micscif_handle_lostnode(i);
		mic_ctx->sdbic1 = 0;
		break;
	default:
		break;
	}

	switch (mic_ctx->state) {
	case MIC_ONLINE:
		break;
	case MIC_BOOT:
		/* Not up yet; just poll again on the next tick. */
		goto restart_timer;
	case MIC_SHUTDOWN:
	case MIC_LOST:
	case MIC_READY:
	case MIC_NORESPONSE:
	case MIC_BOOTFAIL:
	case MIC_RESET:
	case MIC_RESETFAIL:
	case MIC_INVALID:
		/* Node is not probe-able in these states: stop the watchdog. */
		return;
	}

	if (!ms_info.mi_watchdog_enabled)
		return;

	/* Hold a PM reference so the node cannot idle away under us. */
	err = micpm_get_reference(mic_ctx, false);
	if (err == -EAGAIN) {
		/* Node is transitioning; retry on the next tick. */
		goto restart_timer;
	} else if (err == -ENODEV) {
		micscif_handle_lostnode(i);
		goto restart_timer;
	}

	/* Probe only if the node has not already reported in (flag unset). */
	if (1 != atomic_cmpxchg(&dev->sd_node_alive, 1, 0)) {

		err = (int)(micscif_send_node_alive(i));

		if (err) {
			micpm_put_reference(mic_ctx);
			goto restart_timer;
		}

		/* Wait for the ack handler to set sd_node_alive back to 1. */
		ret = wait_event_timeout(dev->sd_watchdog_wq,
			(atomic_cmpxchg(&dev->sd_node_alive, 1, 0) == 1),
			NODE_ALIVE_TIMEOUT);
		if (!ret || err)
			micscif_handle_lostnode(i);
	}
	micpm_put_reference(mic_ctx);

restart_timer:
	if (dev->sd_ln_wq)
		queue_delayed_work(dev->sd_ln_wq,
			&dev->sd_watchdog_work, NODE_ALIVE_TIMEOUT);
}
+#else
+
+long micscif_suspend(uint8_t* nodemask) {
+ long ret = 0;
+ int i;
+ struct micscif_dev *dev;
+
+ for (i = 0; i <= ms_info.mi_maxid; i++) {
+ if (get_nodemask_bit(nodemask , i)) {
+ dev = &scif_dev[i];
+ if (SCIFDEV_RUNNING != dev->sd_state)
+ continue;
+
+ ret = atomic_long_cmpxchg(
+ &dev->scif_ref_cnt, 0, SCIF_NODE_IDLE);
+ if (!ret || ret == SCIF_NODE_IDLE) {
+ dev->sd_state = SCIFDEV_SLEEPING;
+ ret = 0;
+ }
+ else {
+ set_nodemask_bit(nodemask, i, 0);
+ ret = EAGAIN;
+ }
+ }
+ }
+ return ret;
+}
+/*
+ * scif_suspend_handler - SCIF tasks before transition to low power state.
+ */
+int micscif_suspend_handler(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ int ret = 0;
+#ifdef SCIF_ENABLE_PM
+ int node = 0;
+ int size;
+ uint8_t *nodemask_buf;
+
+ size = ((int) ((ms_info.mi_maxid + 1) / 8) +
+ (((ms_info.mi_maxid + 1) % 8) ? 1 : 0));
+ nodemask_buf = (uint8_t*)kzalloc(size, GFP_ATOMIC);
+ if(!nodemask_buf)
+ return -ENOMEM;
+
+ for (node = 0; node <= ms_info.mi_maxid; node++) {
+ if ((node != SCIF_HOST_NODE) && (node != ms_info.mi_nodeid))
+ set_nodemask_bit(nodemask_buf, node, 1);
+ }
+
+ if (micscif_suspend(nodemask_buf)){
+ ret = -EBUSY;
+ goto clean_up;
+ }
+
+ dma_suspend(mic_dma_handle);
+clean_up:
+ kfree(nodemask_buf);
+#endif
+ return ret;
+}
+
+/*
+ * micscif_resume_handler - SCIF tasks after wake up from low power state.
+ */
+int micscif_resume_handler(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+#ifdef SCIF_ENABLE_PM
+#ifdef _MIC_SCIF_
+ queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
+#endif
+ dma_resume(mic_dma_handle);
+#endif
+ return 0;
+}
+
+/*
+ * scif_fail_suspend_handler - SCIF tasks if a previous scif_suspend call has
+ * failed since a low power state transition could not be completed.
+ */
+int micscif_fail_suspend_handler(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+/* Stub out function since it is an optimization that isn't working properly */
+#if 0
+#ifdef SCIF_ENABLE_PM
+ int node = 0;
+ long ret;
+ struct micscif_dev *dev;
+
+ for (node = 0; node <= ms_info.mi_maxid; node++) {
+ dev = &scif_dev[node];
+ ret = atomic_long_cmpxchg(&dev->scif_ref_cnt, SCIF_NODE_IDLE, 0);
+ if (ret != SCIF_NODE_IDLE)
+ continue;
+ if (SCIFDEV_SLEEPING == dev->sd_state)
+ dev->sd_state = SCIFDEV_RUNNING;
+ }
+#endif
+#endif
+ return 0;
+}
+
+void micscif_get_node_info(void)
+{
+ struct nodemsg msg;
+ struct get_node_info node_info;
+
+ init_waitqueue_head(&node_info.wq);
+ node_info.state = OP_IN_PROGRESS;
+ micscif_inc_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
+ msg.uop = SCIF_GET_NODE_INFO;
+ msg.src.node = ms_info.mi_nodeid;
+ msg.dst.node = SCIF_HOST_NODE;
+ msg.payload[3] = (uint64_t)&node_info;
+
+ if ((micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], &msg, NULL)))
+ goto done;
+
+ wait_event(node_info.wq, node_info.state != OP_IN_PROGRESS);
+done:
+ micscif_dec_node_refcnt(&scif_dev[SCIF_HOST_NODE], 1);
+ /* Synchronize with the thread waking us up */
+ mutex_lock(&ms_info.mi_conflock);
+ mutex_unlock(&ms_info.mi_conflock);
+ ;
+}
+#endif /* _MIC_SCIF_ */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+#include "mic/micscif_smpt.h"
+#include "mic/micscif_nodeqp.h"
+#include "mic/micscif_intr.h"
+#include "mic/micscif_nm.h"
+#include "mic_common.h"
+#include "mic/micscif_map.h"
+
+#define SBOX_MMIO_LENGTH 0x10000
/* FIXME: HW specific, define someplace else */
+/* SBOX Offset in MMIO space */
+#define SBOX_OFFSET 0x10000
+
+#ifdef ENABLE_TEST
+static void micscif_qp_testboth(struct micscif_dev *scifdev);
+#endif
+
+bool mic_p2p_enable = 1;
+bool mic_p2p_proxy_enable = 1;
+
+void micscif_teardown_ep(void *endpt)
+{
+ struct endpt *ep = (struct endpt *)endpt;
+ struct micscif_qp *qp = ep->qp_info.qp;
+ if (qp) {
+ if (qp->outbound_q.rb_base)
+ scif_iounmap((void *)qp->outbound_q.rb_base,
+ qp->outbound_q.size, ep->remote_dev);
+ if (qp->remote_qp)
+ scif_iounmap((void *)qp->remote_qp,
+ sizeof(struct micscif_qp), ep->remote_dev);
+ if (qp->local_buf) {
+ unmap_from_aperture(
+ qp->local_buf,
+ ep->remote_dev, ENDPT_QP_SIZE);
+ }
+ if (qp->local_qp) {
+ unmap_from_aperture(qp->local_qp, ep->remote_dev,
+ sizeof(struct micscif_qp));
+ }
+ if (qp->inbound_q.rb_base)
+ kfree((void *)qp->inbound_q.rb_base);
+ kfree(qp);
+#ifdef _MIC_SCIF_
+ micscif_teardown_proxy_dma(endpt);
+#endif
+ WARN_ON(!list_empty(&ep->rma_info.task_list));
+ }
+}
+
+/*
+ * Enqueue the endpoint to the zombie list for cleanup.
+ * The endpoint should not be accessed once this API returns.
+ */
+void micscif_add_epd_to_zombie_list(struct endpt *ep, bool mi_eplock_held)
+{
+ unsigned long sflags = 0;
+
+ /*
+ * It is an error to call scif_close() on an endpoint on which a
+ * scif_range structure of that endpoint has not been returned
+ * after a call to scif_get_pages() via scif_put_pages().
+ */
+ if (SCIFEP_CLOSING == ep->state ||
+ SCIFEP_CLOSED == ep->state ||
+ SCIFEP_DISCONNECTED == ep->state)
+ BUG_ON(micscif_rma_list_get_pages_check(ep));
+
+ if (list_empty(&ep->rma_info.task_list) && ep->remote_dev)
+ wake_up(&ep->remote_dev->sd_mmap_wq);
+ if (!mi_eplock_held)
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ spin_lock(&ep->lock);
+ ep->state = SCIFEP_ZOMBIE;
+ spin_unlock(&ep->lock);
+ list_add_tail(&ep->list, &ms_info.mi_zombie);
+ ms_info.mi_nr_zombies++;
+ if (!mi_eplock_held)
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+ queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
+}
+
/* Initializes "local" data structures for the QP
 *
 * Allocates the QP ring buffer (rb), initializes the "in bound" queue
 * For the host generate bus addresses for QP rb & qp, in the card's case
 * map these into the pci aperture
 *
 * @qp: queue pair to initialize.
 * @qp_offset[out]: receives the bus/aperture address of @qp itself.
 * @local_size: size in bytes of the local (inbound) ring buffer.
 * @scifdev: the remote SCIF device this QP talks to.
 * Returns 0 on success, negative errno on failure.
 *
 * NOTE(review): if map_virt_into_aperture() fails after local_q was
 * allocated, the buffer is not freed here - presumably it is reclaimed
 * via the QP/inbound_q teardown path; verify before changing.
 */
int micscif_setup_qp_connect(struct micscif_qp *qp, dma_addr_t *qp_offset,
		int local_size, struct micscif_dev *scifdev)
{
	void *local_q = NULL;
	int err = 0;
	volatile uint32_t tmp_rd;

	spin_lock_init(&qp->qp_send_lock);
	spin_lock_init(&qp->qp_recv_lock);

	if (!qp->inbound_q.rb_base) {
		/* we need to allocate the local buffer for the incoming queue */
		local_q = kzalloc(local_size, GFP_ATOMIC);
		if (!local_q) {
			printk(KERN_ERR "Ring Buffer Allocation Failed\n");
			err = -ENOMEM;
			return err;
		}
		/* to setup the inbound_q, the buffer lives locally (local_q),
		 * the read pointer is remote (in remote_qp's local_read)
		 * the write pointer is local (in local_write)
		 */
		tmp_rd = 0;
		micscif_rb_init(&qp->inbound_q,
			&tmp_rd, /* No read ptr right now ... */
			&(scifdev->qpairs[0].local_write),
			(volatile void *) local_q,
			local_size);
		qp->inbound_q.read_ptr = NULL; /* it is unsafe to use the ring buffer until this changes! */
	}

	if (!qp->local_buf) {
		/* Expose the ring buffer to the remote side via the aperture. */
		err = map_virt_into_aperture(&qp->local_buf, local_q, scifdev, local_size);
		if (err) {
			printk(KERN_ERR "%s %d error %d\n",
				__func__, __LINE__, err);
			return err;
		}
	}

	if (!qp->local_qp) {
		/* Map the QP structure itself and remember its bus address. */
		err = map_virt_into_aperture(qp_offset, qp, scifdev, sizeof(struct micscif_qp));
		if (err) {
			printk(KERN_ERR "%s %d error %d\n",
				__func__, __LINE__, err);
			return err;
		}
		qp->local_qp = *qp_offset;
	} else {
		*qp_offset = qp->local_qp;
	}
	return err;
}
+
/* When the other side has already done it's allocation, this is called */
/* TODO: Replace reads that go across the bus somehow ... */
/*
 * @qp: queue pair to initialize.
 * @qp_offset[out]: receives the bus/aperture address of @qp itself.
 * @phys: bus address of the peer's already-initialized QP structure.
 * @local_size: size in bytes of the local (inbound) ring buffer.
 * @scifdev: the remote SCIF device this QP talks to.
 * Returns 0 on success, negative errno on failure.
 *
 * Maps the peer's QP and ring buffer (outbound side), then allocates and
 * maps the local ring buffer (inbound side). Read/write pointers are
 * cross-wired: outbound uses the local read ptr and remote write ptr,
 * inbound the reverse.
 *
 * NOTE(review): error paths do not unmap remote_qp/remote_q or free
 * local_q - presumably QP teardown handles this; verify before changing.
 */
int micscif_setup_qp_accept(struct micscif_qp *qp, dma_addr_t *qp_offset, dma_addr_t phys, int local_size, struct micscif_dev *scifdev)
{
	void *local_q;
	volatile void *remote_q;
	struct micscif_qp *remote_qp;
	int remote_size;
	int err = 0;

	spin_lock_init(&qp->qp_send_lock);
	spin_lock_init(&qp->qp_recv_lock);
	/* Start by figuring out where we need to point */
	remote_qp = scif_ioremap(phys, sizeof(struct micscif_qp), scifdev);
	qp->remote_qp = remote_qp;
	qp->remote_buf = remote_qp->local_buf;
	/* To setup the outbound_q, the buffer lives in remote memory (at scifdev->bs->buf phys),
	 * the read pointer is local (in local's local_read)
	 * the write pointer is remote (In remote_qp's local_write)
	 */
	remote_size = qp->remote_qp->inbound_q.size; /* TODO: Remove this read for p2p */
	remote_q = scif_ioremap(qp->remote_buf, remote_size, scifdev);

	/* Sanity-check the peer actually initialized its QP. */
	BUG_ON(qp->remote_qp->magic != SCIFEP_MAGIC);

	qp->remote_qp->local_write = 0;
	micscif_rb_init(&(qp->outbound_q),
		&(qp->local_read), /*read ptr*/
		&(qp->remote_qp->local_write), /*write ptr*/
		remote_q, /*rb_base*/
		remote_size);
	/* to setup the inbound_q, the buffer lives locally (local_q),
	 * the read pointer is remote (in remote_qp's local_read)
	 * the write pointer is local (in local_write)
	 */
	local_q = kzalloc(local_size, GFP_KERNEL);
	if (!local_q) {
		printk(KERN_ERR "Ring Buffer Allocation Failed\n");
		err = -ENOMEM;
		return err;
	}

	qp->remote_qp->local_read = 0;
	micscif_rb_init(&(qp->inbound_q),
		&(qp->remote_qp->local_read),
		&(qp->local_write),
		local_q,
		local_size);
	/* Expose the local ring buffer and QP to the peer via the aperture. */
	err = map_virt_into_aperture(&qp->local_buf, local_q, scifdev, local_size);
	if (err) {
		printk(KERN_ERR "%s %d error %d\n",
			__func__, __LINE__, err);
		return err;
	}
	err = map_virt_into_aperture(qp_offset, qp, scifdev, sizeof(struct micscif_qp));
	if (err) {
		printk(KERN_ERR "%s %d error %d\n",
			__func__, __LINE__, err);
		return err;
	}
	qp->local_qp = *qp_offset;
	return err;
}
+
+int micscif_setup_qp_connect_response(struct micscif_dev *scifdev, struct micscif_qp *qp, uint64_t payload)
+{
+ int err = 0;
+ void *r_buf;
+ int remote_size;
+ phys_addr_t tmp_phys;
+
+ qp->remote_qp = scif_ioremap(payload, sizeof(struct micscif_qp), scifdev);
+
+ if (!qp->remote_qp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ if (qp->remote_qp->magic != SCIFEP_MAGIC) {
+ printk(KERN_ERR "SCIFEP_MAGIC doesnot match between node %d "
+ "(self) and %d (remote)\n", scif_dev[ms_info.mi_nodeid].sd_node,
+ scifdev->sd_node);
+ WARN_ON(1);
+ err = -ENODEV;
+ goto error;
+ }
+
+ tmp_phys = readq(&(qp->remote_qp->local_buf));
+ remote_size = readl(&qp->remote_qp->inbound_q.size);
+ r_buf = scif_ioremap(tmp_phys, remote_size, scifdev);
+
+#if 0
+ pr_debug("payload = 0x%llx remote_qp = 0x%p tmp_phys=0x%llx \
+ remote_size=%d r_buf=%p\n", payload, qp->remote_qp,
+ tmp_phys, remote_size, r_buf);
+#endif
+
+ micscif_rb_init(&(qp->outbound_q),
+ &(qp->local_read),
+ &(qp->remote_qp->local_write),
+ r_buf,
+ remote_size);
+ /* resetup the inbound_q now that we know where the inbound_read really is */
+ micscif_rb_init(&(qp->inbound_q),
+ &(qp->remote_qp->local_read),
+ &(qp->local_write),
+ qp->inbound_q.rb_base,
+ qp->inbound_q.size);
+error:
+ return err;
+}
+
+#ifdef _MIC_SCIF_
+extern int micscif_send_host_intr(struct micscif_dev *, uint32_t);
+
+int micscif_send_host_intr(struct micscif_dev *dev, uint32_t doorbell)
+{
+ uint32_t db_reg;
+
+ if (doorbell > 3)
+ return -EINVAL;
+
+ db_reg = readl(dev->mm_sbox +
+ (SBOX_SDBIC0 + (4 * doorbell))) | SBOX_SDBIC0_DBREQ_BIT;
+ writel(db_reg, dev->mm_sbox + (SBOX_SDBIC0 + (4 * doorbell)));
+ return 0;
+}
+#endif
+
+/*
+ * Interrupts remote mic
+ */
+static void
+micscif_send_mic_intr(struct micscif_dev *dev)
+{
+ /* Writes to RDMASR triggers the interrupt */
+ writel(0, (uint8_t *)dev->mm_sbox + dev->sd_rdmasr);
+}
+
/* scifdev - remote scif device
 * also needs the local scif device so that we can decide which RMASR
 * to target on the remote mic
 *
 * Notify the remote side that a node QP message is pending: on the card,
 * messages to the host (scif_dev[0]) go through doorbell 0; everything
 * else (peer cards, or any target when running on the host) uses the
 * remote RDMASR interrupt.
 */
static __always_inline void
scif_send_msg_intr(struct micscif_dev *scifdev)
{
#ifdef _MIC_SCIF_
	if (scifdev == &scif_dev[0])
		micscif_send_host_intr(scifdev, 0);
	else
#endif
		micscif_send_mic_intr(scifdev);
}
+
+#ifdef _MIC_SCIF_
/*
 * micscif_setup_card_qp() - Card-side setup of the node queue pair with
 * the host.
 * @host_phys: bus address of the host's QP structure (received at boot).
 * @scifdev: the host's SCIF device as seen from the card.
 *
 * Allocates the single card<->host queue pair, completes it against the
 * host's already-initialized side, verifies the SCIF protocol versions
 * match, and replies with a SCIF_INIT message carrying our QP location
 * and RDMASR offset. Returns 0 on success or a negative errno.
 *
 * NOTE(review): scifdev->qpairs is not freed on the error paths -
 * presumably cleaned up by driver teardown; verify before changing.
 */
int micscif_setup_card_qp(phys_addr_t host_phys, struct micscif_dev *scifdev)
{
	int local_size;
	dma_addr_t qp_offset;
	int err = 0;
	struct nodemsg tmp_msg;
	uint16_t host_scif_ver;

	pr_debug("Got 0x%llx from the host\n", host_phys);

	local_size = NODE_QP_SIZE;

	/* FIXME: n_qpairs is always 1 OK to get rid of it ? */
	scifdev->n_qpairs = 1;
	scifdev->qpairs = kzalloc(sizeof(struct micscif_qp), GFP_KERNEL);
	if (!scifdev->qpairs) {
		printk(KERN_ERR "Node QP Allocation failed\n");
		err = -ENOMEM;
		return err;
	}

	scifdev->qpairs->magic = SCIFEP_MAGIC;
	pr_debug("micscif_card(): called qp_accept\n");
	err = micscif_setup_qp_accept(&scifdev->qpairs[0], &qp_offset, host_phys, local_size, scifdev);

	if (!err) {
		/* Reject mismatched driver versions before going further. */
		host_scif_ver = readw(&(&scifdev->qpairs[0])->remote_qp->scif_version);
		if (host_scif_ver != SCIF_VERSION) {
			printk(KERN_ERR "Card and host SCIF versions do not match. \n");
			printk(KERN_ERR "Card version: %u, Host version: %u \n",
				SCIF_VERSION, host_scif_ver);
			err = -ENXIO;
			goto error_exit;
		}
		/* now that everything is setup and mapped, we're ready to tell the
		 * host where our queue's location
		 */
		tmp_msg.uop = SCIF_INIT;
		tmp_msg.payload[0] = qp_offset;
		tmp_msg.payload[1] = get_rdmasr_offset(scifdev->sd_intr_handle);
		tmp_msg.dst.node = 0; /* host */

		pr_debug("micscif_setup_card_qp: micscif_setup_qp_accept, INIT message\n");
		err = micscif_nodeqp_send(scifdev, &tmp_msg, NULL);
	}
error_exit:
	if (err)
		printk(KERN_ERR "%s %d error %d\n",
			__func__, __LINE__, err);
	return err;
}
+
+
+void micscif_send_exit(void)
+{
+ struct nodemsg msg;
+ struct micscif_dev *scifdev = &scif_dev[SCIF_HOST_NODE];
+
+ init_waitqueue_head(&ms_info.mi_exitwq);
+
+ msg.uop = SCIF_EXIT;
+ msg.src.node = ms_info.mi_nodeid;
+ msg.dst.node = scifdev->sd_node;
+ /* No error handling for Host SCIF device */
+ micscif_nodeqp_send(scifdev, &msg, NULL);
+}
+
+#else /* !_MIC_SCIF_ */
+static uint32_t tmp_r_ptr;
+int micscif_setup_host_qp(mic_ctx_t *mic_ctx, struct micscif_dev *scifdev)
+{
+ int err = 0;
+ int local_size;
+
+ /* Bail out if the node QP is already setup */
+ if (scifdev->qpairs)
+ return err;
+
+ local_size = NODE_QP_SIZE;
+
+ /* for now, assume that we only have one queue-pair -- with the host */
+ scifdev->n_qpairs = 1;
+ scifdev->qpairs = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_ATOMIC);
+ if (!scifdev->qpairs) {
+ printk(KERN_ERR "Node QP Allocation failed\n");
+ err = -ENOMEM;
+ return err;
+ }
+
+ scifdev->qpairs->magic = SCIFEP_MAGIC;
+ scifdev->qpairs->scif_version = SCIF_VERSION;
+ err = micscif_setup_qp_connect(&scifdev->qpairs[0], &(mic_ctx->bi_scif.si_pa), local_size, scifdev);
+ /* fake the read pointer setup so we can use the inbound q */
+ scifdev->qpairs[0].inbound_q.read_ptr = &tmp_r_ptr;
+
+ /* We're as setup as we can be ... the inbound_q is setup, w/o
+ * a usable outbound q. When we get a message, the read_ptr will
+ * be updated, so we know there's something here. When that happens,
+ * we finish the setup (just point the write pointer to the real
+ * write pointer that lives on the card), and pull the message off
+ * the card.
+ * Tell the card where we are.
+ */
+ printk("My Phys addrs: 0x%llx and scif_addr 0x%llx\n", scifdev->qpairs[0].local_buf,
+ mic_ctx->bi_scif.si_pa);
+
+ if (err) printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err);
+ return err;
+}
+
+
+/* FIXME: add to header */
+struct scatterlist * micscif_p2p_mapsg(void *va, int page_size, int page_cnt);
+void micscif_p2p_freesg(struct scatterlist *);
+mic_ctx_t* get_per_dev_ctx(uint16_t node);
+
/* Init p2p mappings required to access peerdev from scifdev
 *
 * @scifdev: device that will be doing the accessing.
 * @peerdev: device whose MMIO and aperture windows are being mapped.
 *
 * Builds scatterlists over the peer's MMIO and aperture regions, DMA-maps
 * them against @scifdev's PCI device, and records the resulting DMA
 * addresses/lengths in a freshly allocated scif_p2p_info.
 * Returns the new scif_p2p_info, or NULL if the allocation failed.
 *
 * NOTE(review): per the FIXME below, micscif_p2p_mapsg()/pci_map_sg()
 * results are not error-checked; a failed mapping would go unnoticed.
 */
static struct scif_p2p_info *
init_p2p_info(struct micscif_dev *scifdev, struct micscif_dev *peerdev)
{
	struct _mic_ctx_t *mic_ctx_peer;
	struct _mic_ctx_t *mic_ctx;
	struct scif_p2p_info *p2p;
	int num_mmio_pages;
	int num_aper_pages;

	mic_ctx = get_per_dev_ctx(scifdev->sd_node - 1);
	mic_ctx_peer = get_per_dev_ctx(peerdev->sd_node - 1);

	num_mmio_pages = (int) (mic_ctx_peer->mmio.len >> PAGE_SHIFT);
	num_aper_pages = (int) (mic_ctx_peer->aper.len >> PAGE_SHIFT);

	// First map the peer board addresses into the new board
	p2p = kzalloc(sizeof(struct scif_p2p_info), GFP_KERNEL);

	if (p2p){
		/* Aperture is mapped with large (up to 1GB) chunks. */
		int sg_page_shift = get_order(min(mic_ctx_peer->aper.len,(uint64_t)(1 << 30)));
		/* FIXME: check return codes below */
		p2p->ppi_sg[PPI_MMIO] = micscif_p2p_mapsg(mic_ctx_peer->mmio.va, PAGE_SIZE,
			num_mmio_pages);
		p2p->sg_nentries[PPI_MMIO] = num_mmio_pages;
		p2p->ppi_sg[PPI_APER] = micscif_p2p_mapsg(mic_ctx_peer->aper.va, 1 << sg_page_shift,
			num_aper_pages >> (sg_page_shift - PAGE_SHIFT));
		p2p->sg_nentries[PPI_APER] = num_aper_pages >> (sg_page_shift - PAGE_SHIFT);

		pci_map_sg(mic_ctx->bi_pdev, p2p->ppi_sg[PPI_MMIO], num_mmio_pages, PCI_DMA_BIDIRECTIONAL);
		pci_map_sg(mic_ctx->bi_pdev, p2p->ppi_sg[PPI_APER],
			num_aper_pages >> (sg_page_shift - PAGE_SHIFT), PCI_DMA_BIDIRECTIONAL);

		/* Record the DMA addresses of the first entry of each list. */
		p2p->ppi_pa[PPI_MMIO] = sg_dma_address(p2p->ppi_sg[PPI_MMIO]);
		p2p->ppi_pa[PPI_APER] = sg_dma_address(p2p->ppi_sg[PPI_APER]);
		p2p->ppi_len[PPI_MMIO] = num_mmio_pages;
		p2p->ppi_len[PPI_APER] = num_aper_pages;
		p2p->ppi_disc_state = SCIFDEV_RUNNING;
		p2p->ppi_peer_id = peerdev->sd_node;

	}
	return (p2p);
}
+
+
/*
 * micscif_setuphost_response() - Host-side handling of the card's
 * SCIF_INIT message.
 * @scifdev: the card's SCIF device.
 * @payload: bus address of the card's QP structure.
 *
 * Completes the queue pair against the card's side, re-reads the
 * bootstrap message to learn the card's RDMASR offset, echoes a
 * SCIF_INIT back to the card, and marks the device running in the
 * global config (mi_mask/mi_maxid/mi_total).
 * Returns 0 on success or a negative errno.
 */
int micscif_setuphost_response(struct micscif_dev *scifdev, uint64_t payload)
{
	int read_size;
	struct nodemsg msg;
	int err = 0;

	pr_debug("micscif_setuphost_response: scif node %d\n", scifdev->sd_node);
	err = micscif_setup_qp_connect_response(scifdev, &scifdev->qpairs[0], payload);
	if (err) {
		printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err);
		return err;
	}
	/* re-receive the bootstrap message after re-init call */
	pr_debug("micscif_host(): reading INIT message after re-init call\n");
	read_size = micscif_rb_get_next(&(scifdev->qpairs[0].inbound_q), &msg,
		sizeof(struct nodemsg));
	micscif_rb_update_read_ptr(&(scifdev->qpairs[0].inbound_q));

	/* The card told us which RDMASR to use when interrupting it. */
	scifdev->sd_rdmasr = (uint32_t)msg.payload[1];

	/* for testing, send a message back to the card */
	msg.uop = SCIF_INIT;
	msg.payload[0] = 0xdeadbeef;
	msg.dst.node = scifdev->sd_node; /* card */
	if ((err = micscif_nodeqp_send(scifdev, &msg, NULL))) {
		printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err);
		return err;
	}

#ifdef ENABLE_TEST
	/* Launch the micscif_rb test */
	pr_debug("micscif_host(): starting TEST\n");
	micscif_qp_testboth(scifdev);
#endif

	/*
	 * micscif_nodeqp_intrhandler(..) increments the ref_count before calling
	 * this API hence clamp the scif_ref_cnt to 1. This is required to
	 * handle the SCIF module load/unload case on MIC. The SCIF_EXIT message
	 * keeps the ref_cnt clamped to SCIF_NODE_IDLE during module unload.
	 * Setting the ref_cnt to 1 during SCIF_INIT ensures that the ref_cnt
	 * returns back to 0 once SCIF module load completes.
	 */
#ifdef SCIF_ENABLE_PM
	scifdev->scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(1);
#endif
	/* Publish the node as online in the global configuration. */
	mutex_lock(&ms_info.mi_conflock);
	ms_info.mi_mask |= 0x1 << scifdev->sd_node;
	ms_info.mi_maxid = SCIF_MAX(scifdev->sd_node, ms_info.mi_maxid);
	ms_info.mi_total++;
	scifdev->sd_state = SCIFDEV_RUNNING;
	mutex_unlock(&ms_info.mi_conflock);

	micscif_node_add_callback(scifdev->sd_node);
	return err;
}
+
+void
+micscif_removehost_respose(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(scifdev->sd_node -1);
+ int err;
+
+ if (scifdev->sd_state != SCIFDEV_RUNNING)
+ return;
+
+ micscif_stop(mic_ctx);
+
+ if ((err = micscif_nodeqp_send(scifdev, msg, NULL)))
+ printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err);
+
+ scifdev->sd_state = SCIFDEV_INIT;
+}
+#endif
+
+/* TODO: Fix the non-symmetric use of micscif_dev on the host and the card. Right
+ * now, the card's data structures are shaping up such that there is a single
+ * micscif_dev structure with multiple qp's. The host ends up with multiple
+ * micscif_devs (one per card). We should unify the way this will work.
+ */
+static struct micscif_qp *micscif_nodeqp_find(struct micscif_dev *scifdev, uint8_t node)
+{
+ struct micscif_qp *qp = NULL;
+#ifdef _MIC_SCIF_
+ /* This is also a HACK. Even though the code is identical with the host right
+ * now, I broke it into two parts because they will likely not be identical
+ * moving forward
+ */
+ qp = scifdev->qpairs;
+#else
+ /* HORRIBLE HACK! Since we only have one card, and one scifdev, we
+ * can just grab the scifdev->qp to find the qp. We don't actually have to
+ * do any kind of looking for it
+ */
+ qp = scifdev->qpairs;
+#endif /* !_MIC_SCIF_ */
+ return qp;
+}
+
/* Printable names for micscif_dev sd_state values, indexed directly by
 * sd_state (see micscif_display_message()); keep in step with the
 * SCIFDEV_* state values. */
static char *scifdev_state[] = {"SCIFDEV_NOTPRESENT",
				"SCIFDEV_INIT",
				"SCIFDEV_RUNNING",
				"SCIFDEV_SLEEPING",
				"SCIFDEV_STOPPING",
				"SCIFDEV_STOPPED"};
+
/* Printable names for node QP message opcodes, indexed directly by
 * msg->uop (see micscif_display_message()); the order must match the
 * SCIF message opcode enumeration. */
static char *message_types[] = {"BAD",
				"INIT",
				"EXIT",
				"SCIF_NODE_ADD",
				"SCIF_NODE_ADD_ACK",
				"CNCT_REQ",
				"CNCT_GNT",
				"CNCT_GNTACK",
				"CNCT_GNTNACK",
				"CNCT_REJ",
				"CNCT_TERM",
				"TERM_ACK",
				"DISCNCT",
				"DISCNT_ACK",
				"REGISTER",
				"REGISTER_ACK",
				"REGISTER_NACK",
				"UNREGISTER",
				"UNREGISTER_ACK",
				"UNREGISTER_NACK",
				"ALLOC_REQ",
				"ALLOC_GNT",
				"ALLOC_REJ",
				"FREE_PHYS",
				"FREE_VIRT",
				"CLIENT_SENT",
				"CLIENT_RCVD",
				"MUNMAP",
				"MARK",
				"MARK_ACK",
				"MARK_NACK",
				"WAIT",
				"WAIT_ACK",
				"WAIT_NACK",
				"SIGNAL_LOCAL",
				"SIGNAL_REMOTE",
				"SIG_ACK",
				"SIG_NACK",
				"MAP_GTT",
				"MAP_GTT_ACK",
				"MAP_GTT_NACK",
				"UNMAP_GTT",
				"CREATE_NODE_DEP",
				"DESTROY_NODE_DEP",
				"REMOVE_NODE",
				"REMOVE_NODE_ACK",
				"WAKE_UP_NODE",
				"WAKE_UP_NODE_ACK",
				"WAKE_UP_NODE_NACK",
				"SCIF_NODE_ALIVE",
				"SCIF_NODE_ALIVE_ACK",
				"SCIF_SMPT",
				"SCIF_GTT_DMA_MAP",
				"SCIF_GTT_DMA_ACK",
				"SCIF_GTT_DMA_NACK",
				"SCIF_GTT_DMA_UNMAP",
				"SCIF_PROXY_DMA",
				"SCIF_PROXY_ORDERED_DMA",
				"SCIF_NODE_CONNECT",
				"SCIF_NODE_CONNECT_NACK",
				"SCIF_NODE_ADD_NACK",
				"SCIF_GET_NODE_INFO",
				"TEST"};
+
/*
 * micscif_display_message() - Dump a node QP message to the kernel log
 * when message logging (ms_info.en_msg_log) is enabled.
 * @scifdev: device the message is being sent to / was received from.
 * @msg: the message to dump.
 * @label: prefix for the log line (e.g. "Sent").
 *
 * Out-of-range opcodes get a short debug line; SCIF_TEST traffic is
 * suppressed entirely to avoid flooding the log during tests.
 */
static void
micscif_display_message(struct micscif_dev *scifdev, struct nodemsg *msg,
	const char *label)
{
	if (!ms_info.en_msg_log)
		return;
	if (msg->uop > SCIF_MAX_MSG) {
		pr_debug("%s: unknown msg type %d\n", label, msg->uop);
		return;
	}
	if (msg->uop == SCIF_TEST)
		return;

	printk("%s: %s msg type %s, src %d:%d, dest %d:%d "
		"payload 0x%llx:0x%llx:0x%llx:0x%llx\n",
		label, scifdev_state[scifdev->sd_state],
		message_types[msg->uop], msg->src.node, msg->src.port,
		msg->dst.node, msg->dst.port, msg->payload[0], msg->payload[1],
		msg->payload[2], msg->payload[3]);
}
+
+/**
+ * micscif_nodeqp_send - Send a message on the Node Qp.
+ * @scifdev: Scif Device.
+ * @msg: The message to be sent.
+ *
+ * This function will block till a message is not sent to the destination
+ * scif device.
+ */
+int micscif_nodeqp_send(struct micscif_dev *scifdev,
+ struct nodemsg *msg, struct endpt *ep)
+{
+ struct micscif_qp *qp;
+ int err = -ENOMEM, loop_cnt = 0;
+
+ if (oops_in_progress ||
+ (SCIF_INIT != msg->uop &&
+ SCIF_EXIT != msg->uop &&
+ SCIFDEV_RUNNING != scifdev->sd_state &&
+ SCIFDEV_SLEEPING != scifdev->sd_state) ||
+ (ep && SCIFDEV_STOPPED == ep->sd_state)) {
+ err = -ENODEV;
+ goto error;
+ }
+
+ micscif_display_message(scifdev, msg, "Sent");
+
+ qp = micscif_nodeqp_find(scifdev, (uint8_t)msg->dst.node);
+ if (!qp) {
+ err = -EINVAL;
+ goto error;
+ }
+ spin_lock(&qp->qp_send_lock);
+
+ while ((err = micscif_rb_write(&qp->outbound_q,
+ msg, sizeof(struct nodemsg)))) {
+ cpu_relax();
+ mdelay(1);
+ if (loop_cnt++ > (NODEQP_SEND_TO_MSEC)) {
+ err = -ENODEV;
+ break;
+ }
+ }
+ if (!err)
+ micscif_rb_commit(&qp->outbound_q);
+ spin_unlock(&qp->qp_send_lock);
+ if (!err) {
+ if (is_self_scifdev(scifdev))
+ /*
+ * For loopback we need to emulate an interrupt by queueing
+ * work for the queue handling real Node Qp interrupts.
+ */
+
+ queue_work(scifdev->sd_intr_wq, &scifdev->sd_intr_bh);
+ else
+ scif_send_msg_intr(scifdev);
+ }
+error:
+ if (err)
+ pr_debug("%s %d error %d uop %d\n",
+ __func__, __LINE__, err, msg->uop);
+ return err;
+}
+
+/* TODO: Make this actually figure out where the interrupt came from. For host, it can
+ * be a little easier (one "vector" per board). For the cards, we'll have to do some
+ * scanning, methinks
+ */
+struct micscif_qp *micscif_nodeqp_nextmsg(struct micscif_dev *scifdev)
+{
+ return &scifdev->qpairs[0];
+}
+
+/*
+ * micscif_misc_handler:
+ *
+ * Work queue handler for servicing miscellaneous SCIF tasks.
+ * Examples include:
+ * 1) Remote fence requests.
+ * 2) Destruction of temporary registered windows
+ * created during scif_vreadfrom()/scif_vwriteto().
+ * 3) Cleanup of zombie endpoints.
+ */
+void micscif_misc_handler(struct work_struct *work)
+{
+ micscif_rma_handle_remote_fences();
+ micscif_rma_destroy_temp_windows();
+#ifdef _MIC_SCIF_
+ vm_unmap_aliases();
+#endif
+ micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc);
+ micscif_cleanup_zombie_epd();
+}
+
+/**
+ * scif_init_resp() - Respond to SCIF_INIT interrupt message
+ * @scifdev: Other node device to respond to
+ * @msg: Interrupt message
+ *
+ * Loading the driver on the MIC card sends an INIT message to the host
+ * with the PCI bus memory information it needs. This function receives
+ * that message, finishes its intialization and echoes it back to the card.
+ *
+ * When the card receives the message this function starts a connection test.
+ */
+static __always_inline void
+scif_init_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+#ifdef _MIC_SCIF_
+ if (msg->payload[0] != 0xdeadbeef)
+ printk(KERN_ERR "Bad payload 0x%llx\n", msg->payload[0]);
+#ifdef ENABLE_TEST
+ else
+ micscif_qp_testboth(scifdev);
+#endif
+#else
+ pr_debug("scifhost(): sending response to INIT\n");
+ micscif_setuphost_response(scifdev, msg->payload[0]);
+ atomic_set(&scifdev->sd_node_alive, 0);
+ if (scifdev->sd_ln_wq)
+ queue_delayed_work(scifdev->sd_ln_wq,
+ &scifdev->sd_watchdog_work, NODE_ALIVE_TIMEOUT);
+#endif
+}
+
+/**
+ * scif_exit_resp() - Respond to SCIF_EXIT interrupt message
+ * @scifdev: Other node device to respond to
+ * @msg: Interrupt message
+ *
+ * Loading the driver on the MIC card sends an INIT message to the host
+ * with the PCI bus memory information it needs. This function receives
+ * that message, finishes its intialization and echoes it back to the card.
+ *
+ * When the card receives the message this function starts a connection test.
+ */
+static __always_inline void
+scif_exit_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+#ifdef _MIC_SCIF_
+ printk("card: scif node %d exiting\n", ms_info.mi_nodeid);
+ scif_dev[ms_info.mi_nodeid].sd_state = SCIFDEV_STOPPED;
+ wake_up(&ms_info.mi_exitwq);
+#else
+ printk("host: scif node %d exiting\n", msg->src.node);
+ /* The interrupt handler that received the message would have
+ * bumped up the ref_cnt by 1. micscif_removehost_response
+ * calls micscif_cleanup_scifdev which loops forever for the ref_cnt
+ * to drop to 0 thereby leading to a soft lockup. To prevent
+ * that, decrement the ref_cnt here.
+ */
+ micscif_dec_node_refcnt(scifdev, 1);
+ micscif_removehost_respose(scifdev, msg);
+ /* increment the ref_cnt here. The interrupt handler will now
+ * decrement it, leaving the ref_cnt to 0 if everything
+ * works as expected. Note that its not absolutely necessary
+ * to do this execpt to make sure ref_cnt is 0 and to catch
+ * errors that may happen if ref_cnt drops to a negative value.
+ */
+ micscif_inc_node_refcnt(scifdev, 1);
+
+#endif
+}
+
+/**
+ * scif_nodeadd_resp() - Respond to SCIF_NODE_ADD interrupt message
+ * @scifdev: Other node device to respond to
+ * @msg: Interrupt message
+ *
+ * When the host driver has finished initializing a MIC node queue pair it
+ * marks the board as online. It then looks for all currently online MIC
+ * cards and send a SCIF_NODE_ADD message to identify the ID of the new card for
+ * peer to peer initialization
+ *
+ * The local node allocates its incoming queue and sends its address in the
+ * SCIF_NODE_ADD_ACK message back to the host, the host "reflects" this message
+ * to the new node
+ */
+static __always_inline void
+scif_nodeadd_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+#ifdef _MIC_SCIF_
+ struct micscif_dev *newdev;
+ dma_addr_t qp_offset;
+ int qp_connect;
+
+ pr_debug("Scifdev %d:%d received NODE_ADD msg for node %d\n",
+ scifdev->sd_node, msg->dst.node, msg->src.node);
+ pr_debug("Remote address for this node's aperture %llx\n",
+ msg->payload[0]);
+ printk("Remote node's sbox %llx\n", msg->payload[1]);
+
+ newdev = &scif_dev[msg->src.node];
+ newdev->sd_node = msg->src.node;
+
+ if (micscif_setup_interrupts(newdev)) {
+ printk(KERN_ERR "failed to setup interrupts for %d\n", msg->src.node);
+ goto interrupt_setup_error;
+ }
+
+ newdev->mm_sbox = ioremap_nocache(msg->payload[1] + SBOX_OFFSET, SBOX_MMIO_LENGTH);
+
+ if (!newdev->mm_sbox) {
+ printk(KERN_ERR "failed to map mmio for %d\n", msg->src.node);
+ goto mmio_map_error;
+ }
+
+ if (!(newdev->qpairs = kzalloc(sizeof(struct micscif_qp), GFP_KERNEL))) {
+ printk(KERN_ERR "failed to allocate qpair for %d\n", msg->src.node);
+ goto qp_alloc_error;
+ }
+
+ /* Set the base address of the remote node's memory since it gets
+ * added to qp_offset
+ */
+ newdev->sd_base_addr = msg->payload[0];
+
+ if ((qp_connect = micscif_setup_qp_connect(newdev->qpairs, &qp_offset,
+ NODE_QP_SIZE, newdev))) {
+ printk(KERN_ERR "failed to setup qp_connect %d\n", qp_connect);
+ goto qp_connect_error;
+ }
+
+ if (register_scif_intr_handler(newdev))
+ goto qp_connect_error;
+
+ newdev->scif_ref_cnt = (atomic_long_t) ATOMIC_LONG_INIT(0);
+ micscif_node_add_callback(msg->src.node);
+ newdev->qpairs->magic = SCIFEP_MAGIC;
+ newdev->qpairs->qp_state = QP_OFFLINE;
+ wmb();
+
+ msg->uop = SCIF_NODE_ADD_ACK;
+ msg->dst.node = msg->src.node;
+ msg->src.node = ms_info.mi_nodeid;
+ msg->payload[0] = qp_offset;
+ msg->payload[2] = get_rdmasr_offset(newdev->sd_intr_handle);
+ msg->payload[3] = scif_dev[ms_info.mi_nodeid].sd_numa_node;
+ micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL);
+ return;
+
+qp_connect_error:
+ kfree(newdev->qpairs);
+ newdev->qpairs = NULL;
+qp_alloc_error:
+ iounmap(newdev->mm_sbox);
+ newdev->mm_sbox = NULL;
+mmio_map_error:
+interrupt_setup_error:
+ printk(KERN_ERR "node add failed for node %d\n", msg->src.node);
+ /*
+ * Update self with NODE ADD failure and send
+ * nack to update the peer.
+ */
+ mutex_lock(&newdev->sd_lock);
+ newdev->sd_state = SCIFDEV_NOTPRESENT;
+ mutex_unlock(&newdev->sd_lock);
+ wake_up_interruptible(&newdev->sd_p2p_wq);
+ msg->uop = SCIF_NODE_ADD_NACK;
+ msg->dst.node = msg->src.node;
+ msg->src.node = ms_info.mi_nodeid;
+ micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL);
+#endif
+}
+
+#ifdef _MIC_SCIF_
/*
 * Tear down a partially/fully initialized P2P peer device: release the
 * interrupt handler, unmap its SBOX MMIO and mark it not-present under
 * sd_lock. Teardown order mirrors the setup in scif_nodeadd_resp().
 */
static inline void scif_p2pdev_uninit(struct micscif_dev *peerdev)
{
	deregister_scif_intr_handler(peerdev);
	iounmap(peerdev->mm_sbox);
	mutex_lock(&peerdev->sd_lock);
	peerdev->sd_state = SCIFDEV_NOTPRESENT;
	mutex_unlock(&peerdev->sd_lock);
}
+
/*
 * Delayed-work handler that polls a P2P peer's queue pair until it comes
 * online. Re-queues itself every NODE_QP_TIMEOUT ms while the qp is still
 * QP_OFFLINE; gives up after NODE_QP_RETRY attempts, marks the peer's
 * remote qp offline, uninitializes the peer device and wakes any waiters.
 */
void scif_poll_qp_state(struct work_struct *work)
{
#define NODE_QP_RETRY 100
	struct micscif_dev *peerdev = container_of(work, struct micscif_dev,
							sd_p2p_dwork.work);
	struct micscif_qp *qp = &peerdev->qpairs[0];

	/* Peer may already have been torn down; nothing to poll then. */
	if (SCIFDEV_RUNNING != peerdev->sd_state)
		return;
	if (qp->qp_state == QP_OFFLINE) {
		/* post-increment: the check fires on the (RETRY+1)th pass */
		if (peerdev->sd_p2p_retry++ == NODE_QP_RETRY) {
			printk(KERN_ERR "Warning: QP check timeout with "
				"state %d\n", qp->qp_state);
			goto timeout;
		}
		schedule_delayed_work(&peerdev->sd_p2p_dwork,
			msecs_to_jiffies(NODE_QP_TIMEOUT));
		return;
	}
	/* qp transitioned online; release whoever is waiting on it */
	wake_up(&peerdev->sd_p2p_wq);
	return;
timeout:
	printk(KERN_ERR "%s %d remote node %d offline, state = 0x%x\n",
		__func__, __LINE__, peerdev->sd_node, qp->qp_state);
	/* accessing the peer's qp memory; keep the node awake meanwhile */
	micscif_inc_node_refcnt(peerdev, 1);
	qp->remote_qp->qp_state = QP_OFFLINE;
	micscif_dec_node_refcnt(peerdev, 1);
	scif_p2pdev_uninit(peerdev);
	wake_up(&peerdev->sd_p2p_wq);
}
+#endif
+
+/**
+ * scif_nodeaddack_resp() - Respond to SCIF_NODE_ADD_ACK interrupt message
+ * @scifdev: Other node device to respond to
+ * @msg: Interrupt message
+ *
+ * After a MIC node receives the SCIF_LINK_ADD_ACK message it send this
+ * message to the host to confirm the sequeuce is finished.
+ *
+ */
+static __always_inline void
+scif_nodeaddack_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+#ifdef _MIC_SCIF_
+ struct micscif_dev *peerdev;
+ struct micscif_qp *qp;
+#else
+ struct micscif_dev *dst_dev = &scif_dev[msg->dst.node];
+#endif
+ pr_debug("Scifdev %d received SCIF_NODE_ADD_ACK msg for src %d dst %d\n",
+ scifdev->sd_node, msg->src.node, msg->dst.node);
+ pr_debug("payload %llx %llx %llx %llx\n", msg->payload[0], msg->payload[1],
+ msg->payload[2], msg->payload[3]);
+#ifndef _MIC_SCIF_
+
+ /* the lock serializes with micscif_setuphost_response
+ * The host is forwarding the NODE_ADD_ACK message from src to dst
+ * we need to make sure that the dst has already received a NODE_ADD
+ * for src and setup its end of the qp to dst
+ */
+ mutex_lock(&ms_info.mi_conflock);
+ msg->payload[1] = ms_info.mi_maxid;
+ mutex_unlock(&ms_info.mi_conflock);
+ micscif_inc_node_refcnt(dst_dev, 1);
+ micscif_nodeqp_send(dst_dev, msg, NULL);
+ micscif_dec_node_refcnt(dst_dev, 1);
+#else
+ peerdev = &scif_dev[msg->src.node];
+ peerdev->sd_node = msg->src.node;
+
+ if (peerdev->sd_state == SCIFDEV_NOTPRESENT)
+ return;
+
+ qp = &peerdev->qpairs[0];
+
+ if ((micscif_setup_qp_connect_response(peerdev, &peerdev->qpairs[0],
+ msg->payload[0])))
+ goto local_error;
+
+ mutex_lock(&peerdev->sd_lock);
+ peerdev->sd_numa_node = msg->payload[3];
+ /*
+ * Proxy the DMA only for P2P reads with transfer size
+ * greater than proxy DMA threshold. Proxying reads to convert
+ * them into writes is only required for host jaketown platforms
+ * when the two MIC devices are connected to the same
+ * QPI/IOH/numa node. The host will not pass the numa node
+ * information for non Intel Jaketown platforms and it will
+ * be -1 in that case.
+ */
+ peerdev->sd_proxy_dma_reads =
+ mic_p2p_proxy_enable &&
+ scif_dev[ms_info.mi_nodeid].sd_numa_node != -1 &&
+ (peerdev->sd_numa_node ==
+ scif_dev[ms_info.mi_nodeid].sd_numa_node);
+ peerdev->sd_state = SCIFDEV_RUNNING;
+ mutex_unlock(&peerdev->sd_lock);
+
+ mutex_lock(&ms_info.mi_conflock);
+ ms_info.mi_maxid = msg->payload[1];
+ peerdev->sd_rdmasr = msg->payload[2];
+ mutex_unlock(&ms_info.mi_conflock);
+
+ /* accessing the peer qp. Make sure the peer is awake*/
+ micscif_inc_node_refcnt(peerdev, 1);
+ qp->remote_qp->qp_state = QP_ONLINE;
+ micscif_dec_node_refcnt(peerdev, 1);
+ schedule_delayed_work(&peerdev->sd_p2p_dwork,
+ msecs_to_jiffies(NODE_QP_TIMEOUT));
+ return;
+local_error:
+ scif_p2pdev_uninit(peerdev);
+ wake_up(&peerdev->sd_p2p_wq);
+#endif
+}
+
+/**
+ * scif_cnctreq_resp() - Respond to SCIF_CNCT_REQ interrupt message
+ * @msg: Interrupt message
+ *
+ * This message is initiated by the remote node to request a connection
+ * to the local node. This function looks for an end point in the
+ * listen state on the requested port id.
+ *
+ * If it finds a listening port it places the connect request on the
+ * listening end points queue and wakes up any pending accept calls.
+ *
+ * If it does not find a listening end point it sends a connection
+ * reject message to the remote node.
+ */
+static __always_inline void
+scif_cnctreq_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = NULL;
+ struct conreq *conreq;
+ unsigned long sflags;
+
+ if ((conreq = (struct conreq *)kmalloc(sizeof(struct conreq), GFP_KERNEL)) == NULL) {
+ // Lack of resources so reject the request.
+ goto conreq_sendrej;
+ }
+
+ if ((ep = micscif_find_listen_ep(msg->dst.port, &sflags)) == NULL) {
+ // Send reject due to no listening ports
+ goto conreq_sendrej_free;
+ }
+
+ if (ep->backlog <= ep->conreqcnt) {
+ // Send reject due to too many pending requests
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ goto conreq_sendrej_free;
+ }
+
+ conreq->msg = *msg;
+ list_add_tail(&conreq->list, &ep->conlist);
+ ep->conreqcnt++;
+ spin_unlock_irqrestore(&ep->lock, sflags);
+
+ wake_up_interruptible(&ep->conwq);
+ return;
+
+conreq_sendrej_free:
+ kfree(conreq);
+conreq_sendrej:
+ msg->uop = SCIF_CNCT_REJ;
+ micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL);
+}
+
+/**
+ * scif_cnctgnt_resp() - Respond to SCIF_CNCT_GNT interrupt message
+ * @msg: Interrupt message
+ *
+ * An accept() on the remote node has occured and sent this message
+ * to indicate success. Place the end point in the MAPPING state and
+ * save the remote nodes memory information. Then wake up the connect
+ * request so it can finish.
+ */
+static __always_inline void
+scif_cnctgnt_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ unsigned long sflags;
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+
+ spin_lock_irqsave(&ep->lock, sflags);
+ if (SCIFEP_CONNECTING == ep->state) {
+ ep->peer.node = msg->src.node;
+ ep->peer.port = msg->src.port;
+ ep->qp_info.cnct_gnt_payload = msg->payload[1];
+ ep->remote_ep = msg->payload[2];
+ ep->state = SCIFEP_MAPPING;
+
+ wake_up_interruptible(&ep->conwq);
+ wake_up(&ep->diswq);
+ }
+ spin_unlock_irqrestore(&ep->lock, sflags);
+}
+
+/**
+ * scif_cnctgntack_resp() - Respond to SCIF_CNCT_GNTACK interrupt message
+ * @msg: Interrupt message
+ *
+ * The remote connection request has finished mapping the local memmory.
+ * Place the connection in the connected state and wake up the pending
+ * accept() call.
+ */
+static __always_inline void
+scif_cnctgntack_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ unsigned long sflags;
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ spin_lock(&ep->lock);
+ // New ep is now connected with all resouces set.
+ ep->state = SCIFEP_CONNECTED;
+ list_add_tail(&ep->list, &ms_info.mi_connected);
+ get_conn_count(scifdev);
+ wake_up(&ep->conwq);
+ spin_unlock(&ep->lock);
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+}
+
+/**
+ * scif_cnctgntnack_resp() - Respond to SCIF_CNCT_GNTNACK interrupt message
+ * @msg: Interrupt message
+ *
+ * The remote connection request failed to map the local memory it was sent.
+ * Place the end point in the CLOSING state to indicate it and wake up
+ * the pending accept();
+ */
+static __always_inline void
+scif_cnctgntnack_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ unsigned long sflags;
+
+ spin_lock_irqsave(&ep->lock, sflags);
+ ep->state = SCIFEP_CLOSING;
+ wake_up(&ep->conwq);
+ spin_unlock_irqrestore(&ep->lock, sflags);
+}
+
+/**
+ * scif_cnctrej_resp() - Respond to SCIF_CNCT_REJ interrupt message
+ * @msg: Interrupt message
+ *
+ * The remote end has rejected the connection request. Set the end
+ * point back to the bound state and wake up the pending connect().
+ */
+static __always_inline void
+scif_cnctrej_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ unsigned long sflags;
+
+ spin_lock_irqsave(&ep->lock, sflags);
+ if (SCIFEP_CONNECTING == ep->state) {
+ ep->state = SCIFEP_BOUND;
+ wake_up_interruptible(&ep->conwq);
+ }
+ spin_unlock_irqrestore(&ep->lock, sflags);
+}
+
+/**
+ * scif_cnctterm_resp() - Respond to SCIF_CNCT_TERM interrupt message
+ * @msg: Interrupt message
+ *
+ * The remote connect() has waited to long for an accept() to occur and
+ * is removing the connection request.
+ *
+ * If the connection request is not found then it is currently being
+ * processed and a NACK is sent to indicate to the remote connect() to
+ * wait for connection to complete.
+ *
+ * Otherwise the request is removed and an ACK is returned to indicate
+ * success.
+ */
+static __always_inline void
+scif_cnctterm_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ unsigned long sflags;
+ struct endpt *ep = NULL;
+ struct conreq *conreq = NULL;
+
+ ep = micscif_find_listen_ep(msg->dst.port, &sflags);
+
+ if (ep != NULL) {
+ conreq = miscscif_get_connection_request(ep, msg->payload[0]);
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ }
+
+ if (conreq != NULL) {
+ kfree(conreq);
+ msg->uop = SCIF_TERM_ACK;
+ micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL);
+ }
+}
+
+/**
+ * scif_termack_resp() - Respond to SCIF_TERM_ACK interrupt message
+ * @msg: Interrupt message
+ *
+ * Connection termination has been confirmed so set the end point
+ * to bound and allow the connection request to error out.
+ */
+static __always_inline void
+scif_termack_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ unsigned long sflags;
+
+ spin_lock_irqsave(&ep->lock, sflags);
+ if (ep->state != SCIFEP_BOUND) {
+ ep->state = SCIFEP_BOUND;
+ wake_up(&ep->diswq);
+ }
+ spin_unlock_irqrestore(&ep->lock, sflags);
+}
+
+/**
+ * scif_discnct_resp() - Respond to SCIF_DISCNCT interrupt message
+ * @msg: Interrupt message
+ *
+ * The remote node has indicated close() has been called on its end
+ * point. Remove the local end point from the connected list, set its
+ * state to disconnected and ensure accesses to the remote node are
+ * shutdown.
+ *
+ * When all accesses to the remote end have completed then send a
+ * DISCNT_ACK to indicate it can remove its resources and complete
+ * the close routine.
+ */
+static __always_inline void
+scif_discnct_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ unsigned long sflags;
+ struct endpt *ep = NULL;
+ struct endpt *tmpep;
+ struct list_head *pos, *tmpq;
+
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_for_each_safe(pos, tmpq, &ms_info.mi_connected) {
+ tmpep = list_entry(pos, struct endpt, list);
+ if (((uint64_t)tmpep == msg->payload[1]) && ((uint64_t)tmpep->remote_ep == msg->payload[0])) {
+ list_del(pos);
+ put_conn_count(scifdev);
+ ep = tmpep;
+ spin_lock(&ep->lock);
+ break;
+ }
+ }
+
+ // If the terminated end is not found then this side started closing
+ // before the other side sent the disconnect. If so the ep will no
+ // longer be on the connected list. Reguardless the other side
+ // needs to be acked to let it know close is complete.
+ if (ep == NULL) {
+ // Need to unlock conn lock and restore irq state
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+ goto discnct_resp_ack;
+ }
+
+ ep->state = SCIFEP_DISCONNECTED;
+ list_add_tail(&ep->list, &ms_info.mi_disconnected);
+
+ // TODO Cause associated resources to be freed.
+ // First step: wake up threads blocked in send and recv
+ wake_up_interruptible(&ep->sendwq);
+ wake_up_interruptible(&ep->recvwq);
+ wake_up_interruptible(&ep->conwq);
+ spin_unlock(&ep->lock);
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+
+discnct_resp_ack:
+ msg->uop = SCIF_DISCNT_ACK;
+ micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL);
+}
+
+/**
+ * scif_discnctack_resp() - Respond to SCIF_DISCNT_ACK interrupt message
+ * @msg: Interrupt message
+ *
+ * Remote side has indicated it has not more references to local resources
+ */
+static __always_inline void
+scif_discntack_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ unsigned long sflags;
+
+ spin_lock_irqsave(&ep->lock, sflags);
+ ep->state = SCIFEP_DISCONNECTED;
+ wake_up(&ep->disconwq);
+ spin_unlock_irqrestore(&ep->lock, sflags);
+}
+
+/**
+ * scif_clientsend_resp() - Respond to SCIF_CLIENT_SEND interrupt message
+ * @msg: Interrupt message
+ *
+ * Remote side is confirming send or recieve interrupt handling is complete.
+ */
+static __always_inline void
+scif_clientsend_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+
+ if (SCIFEP_CONNECTED == ep->state) {
+ wake_up_interruptible(&ep->recvwq);
+ }
+}
+
+/**
+ * scif_clientrcvd_resp() - Respond to SCIF_CLIENT_RCVD interrupt message
+ * @msg: Interrupt message
+ *
+ * Remote side is confirming send or recieve interrupt handling is complete.
+ */
+static __always_inline void
+scif_clientrcvd_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+
+ if (SCIFEP_CONNECTED == ep->state) {
+ wake_up_interruptible(&ep->sendwq);
+ }
+}
+
+/**
+ * scif_alloc_req: Respond to SCIF_ALLOC_REQ interrupt message
+ * @msg: Interrupt message
+ *
+ * Remote side is requesting a memory allocation.
+ */
+static __always_inline void
+scif_alloc_req(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ int err, opcode = (int)msg->payload[3];
+ struct reg_range_t *window = 0;
+ size_t nr_pages = msg->payload[1];
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+
+ might_sleep();
+
+ if (SCIFEP_CONNECTED != ep->state) {
+ err = -ENOTCONN;
+ goto error;
+ }
+
+ switch (opcode) {
+ case SCIF_REGISTER:
+ if (!(window = micscif_create_remote_window(ep,
+ (int)nr_pages))) {
+ err = -ENOMEM;
+ goto error;
+ }
+ break;
+ default:
+ /* Unexpected allocation request */
+ printk(KERN_ERR "Unexpected allocation request opcode 0x%x ep = 0x%p "
+ " scifdev->sd_state 0x%x scifdev->sd_node 0x%x\n",
+ opcode, ep, scifdev->sd_state, scifdev->sd_node);
+ err = -EINVAL;
+ goto error;
+ };
+
+ /* The peer's allocation request is granted */
+ msg->uop = SCIF_ALLOC_GNT;
+ msg->payload[0] = (uint64_t)window;
+ msg->payload[1] = window->mapped_offset;
+ if ((err = micscif_nodeqp_send(ep->remote_dev, msg, ep)))
+ micscif_destroy_remote_window(ep, window);
+ return;
+error:
+ /* The peer's allocation request is rejected */
+ printk(KERN_ERR "%s %d error %d alloc_ptr %p nr_pages 0x%lx\n",
+ __func__, __LINE__, err, window, nr_pages);
+ msg->uop = SCIF_ALLOC_REJ;
+ micscif_nodeqp_send(ep->remote_dev, msg, ep);
+}
+
+/**
+ * scif_alloc_gnt_rej: Respond to SCIF_ALLOC_GNT/REJ interrupt message
+ * @msg: Interrupt message
+ *
+ * Remote side responded to a memory allocation.
+ */
+static __always_inline void
+scif_alloc_gnt_rej(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct allocmsg *handle = (struct allocmsg *)msg->payload[2];
+ switch (handle->uop) {
+ case SCIF_REGISTER:
+ {
+ handle->vaddr = (void *)msg->payload[0];
+ handle->phys_addr = msg->payload[1];
+ if (msg->uop == SCIF_ALLOC_GNT)
+ handle->state = OP_COMPLETED;
+ else
+ handle->state = OP_FAILED;
+ wake_up(&handle->allocwq);
+ break;
+ }
+ default:
+ {
+ printk(KERN_ERR "Bug Unknown alloc uop 0x%x\n", handle->uop);
+ }
+ }
+}
+
+/**
+ * scif_free_phys: Respond to SCIF_FREE_PHYS interrupt message
+ * @msg: Interrupt message
+ *
+ * Remote side is done accessing earlier memory allocation.
+ * Remove GTT/PCI mappings created earlier.
+ */
+static __always_inline void
+scif_free_phys(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ return;
+}
+
+/**
+ * scif_free_phys: Respond to SCIF_FREE_VIRT interrupt message
+ * @msg: Interrupt message
+ *
+ * Free up memory kmalloc'd earlier.
+ */
+static __always_inline void
+scif_free_virt(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ int opcode = (int)msg->payload[3];
+ struct reg_range_t *window =
+ (struct reg_range_t *)msg->payload[1];
+
+ switch (opcode) {
+ case SCIF_REGISTER:
+ micscif_destroy_remote_window(ep, window);
+ break;
+ default:
+ /* Unexpected allocation request */
+ BUG_ON(opcode != SCIF_REGISTER);
+ };
+}
+
+/**
+ * scif_recv_register: Respond to SCIF_REGISTER interrupt message
+ * @msg: Interrupt message
+ *
+ * Update remote window list with a new registered window.
+ */
+static __always_inline void
+scif_recv_register(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ unsigned long sflags;
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ struct reg_range_t *window =
+ (struct reg_range_t *)msg->payload[1];
+
+ might_sleep();
+ RMA_MAGIC(window);
+ mutex_lock(&ep->rma_info.rma_lock);
+ /* FIXME:
+ * ep_lock lock needed ? rma_lock is already held
+ */
+ spin_lock_irqsave(&ep->lock, sflags);
+ if (SCIFEP_CONNECTED == ep->state) {
+ msg->uop = SCIF_REGISTER_ACK;
+ micscif_nodeqp_send(ep->remote_dev, msg, ep);
+ micscif_set_nr_pages(ep->remote_dev, window);
+ /* No further failures expected. Insert new window */
+ micscif_insert_window(window,
+ &ep->rma_info.remote_reg_list);
+ } else {
+ msg->uop = SCIF_REGISTER_NACK;
+ micscif_nodeqp_send(ep->remote_dev, msg, ep);
+ }
+ spin_unlock_irqrestore(&ep->lock, sflags);
+ mutex_unlock(&ep->rma_info.rma_lock);
+ /*
+ * We could not insert the window but we need to
+ * destroy the window.
+ */
+ if (SCIF_REGISTER_NACK == msg->uop)
+ micscif_destroy_remote_window(ep, window);
+ else {
+#ifdef _MIC_SCIF_
+ micscif_destroy_remote_lookup(ep, window);
+#endif
+ }
+}
+
+/**
+ * scif_recv_unregister: Respond to SCIF_UNREGISTER interrupt message
+ * @msg: Interrupt message
+ *
+ * Remove window from remote registration list;
+ */
+static __always_inline void
+scif_recv_unregister(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct micscif_rma_req req;
+ struct reg_range_t *window = NULL;
+ struct reg_range_t *recv_window =
+ (struct reg_range_t *)msg->payload[0];
+ struct endpt *ep;
+ int del_window = 0;
+
+ might_sleep();
+ RMA_MAGIC(recv_window);
+ ep = (struct endpt *)recv_window->ep;
+ req.out_window = &window;
+ req.offset = recv_window->offset;
+ req.prot = 0;
+ req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
+ req.type = WINDOW_FULL;
+ req.head = &ep->rma_info.remote_reg_list;
+ msg->payload[0] = ep->remote_ep;
+
+ mutex_lock(&ep->rma_info.rma_lock);
+ /*
+ * Does a valid window exist?
+ */
+ if (micscif_query_window(&req)) {
+ printk(KERN_ERR "%s %d -ENXIO\n", __func__, __LINE__);
+ msg->uop = SCIF_UNREGISTER_ACK;
+ goto error;
+ }
+ if (window) {
+ RMA_MAGIC(window);
+ if (window->ref_count)
+ put_window_ref_count(window, window->nr_pages);
+ window->unreg_state = OP_COMPLETED;
+ if (!window->ref_count) {
+ msg->uop = SCIF_UNREGISTER_ACK;
+ atomic_inc(&ep->rma_info.tw_refcount);
+ atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
+ ep->rma_info.async_list_del = 1;
+ list_del(&window->list_member);
+ window->offset = INVALID_VA_GEN_ADDRESS;
+ del_window = 1;
+ } else
+ /* NACK! There are valid references to this window */
+ msg->uop = SCIF_UNREGISTER_NACK;
+ } else {
+ /* The window did not make its way to the list at all. ACK */
+ msg->uop = SCIF_UNREGISTER_ACK;
+ micscif_destroy_remote_window(ep, recv_window);
+ }
+error:
+ mutex_unlock(&ep->rma_info.rma_lock);
+ if (del_window)
+ drain_dma_intr(ep->rma_info.dma_chan);
+ micscif_nodeqp_send(ep->remote_dev, msg, ep);
+ if (del_window)
+ micscif_queue_for_cleanup(window, &ms_info.mi_rma);
+ return;
+}
+
+/**
+ * scif_recv_register_ack: Respond to SCIF_REGISTER_ACK interrupt message
+ * @msg: Interrupt message
+ *
+ * Wake up the window waiting to complete registration.
+ */
+static __always_inline void
+scif_recv_register_ack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct reg_range_t *window =
+ (struct reg_range_t *)msg->payload[2];
+ RMA_MAGIC(window);
+ window->reg_state = OP_COMPLETED;
+ wake_up(&window->regwq);
+}
+
+/**
+ * scif_recv_register_nack: Respond to SCIF_REGISTER_NACK interrupt message
+ * @msg: Interrupt message
+ *
+ * Wake up the window waiting to inform it that registration
+ * cannot be completed.
+ */
+static __always_inline void
+scif_recv_register_nack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct reg_range_t *window =
+ (struct reg_range_t *)msg->payload[2];
+ RMA_MAGIC(window);
+ window->reg_state = OP_FAILED;
+ wake_up(&window->regwq);
+}
+/**
+ * scif_recv_unregister_ack: Respond to SCIF_UNREGISTER_ACK interrupt message
+ * @msg: Interrupt message
+ *
+ * Wake up the window waiting to complete unregistration.
+ */
+static __always_inline void
+scif_recv_unregister_ack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct reg_range_t *window =
+ (struct reg_range_t *)msg->payload[1];
+ RMA_MAGIC(window);
+ window->unreg_state = OP_COMPLETED;
+ wake_up(&window->unregwq);
+}
+
+/**
+ * scif_recv_unregister_nack: Respond to SCIF_UNREGISTER_NACK interrupt message
+ * @msg: Interrupt message
+ *
+ * Wake up the window waiting to inform it that unregistration
+ * cannot be completed immediately.
+ */
+static __always_inline void
+scif_recv_unregister_nack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct reg_range_t *window =
+ (struct reg_range_t *)msg->payload[1];
+ RMA_MAGIC(window);
+ window->unreg_state = OP_FAILED;
+ wake_up(&window->unregwq);
+}
+
/*
 * scif_recv_munmap: Respond to a SCIF munmap notification
 * @scifdev: Other node device to respond to
 * @msg: Interrupt message
 *
 * The peer unmapped a window: drop its references on the local
 * registered window, and if that leaves the window unreferenced, remove
 * it from the registration list, free its offset and queue it for
 * asynchronous cleanup. Unlike scif_recv_unregister() no reply message
 * is sent.
 */
static __always_inline void
scif_recv_munmap(struct micscif_dev *scifdev, struct nodemsg *msg)
{
	struct micscif_rma_req req;
	struct reg_range_t *window = NULL;
	struct reg_range_t *recv_window =
		(struct reg_range_t *)msg->payload[0];
	struct endpt *ep;
	int del_window = 0;

	might_sleep();
	RMA_MAGIC(recv_window);
	ep = (struct endpt *)recv_window->ep;
	/* Full-window query against the local registration list */
	req.out_window = &window;
	req.offset = recv_window->offset;
	req.prot = recv_window->prot;
	req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
	req.type = WINDOW_FULL;
	req.head = &ep->rma_info.reg_list;
	msg->payload[0] = ep->remote_ep;

	mutex_lock(&ep->rma_info.rma_lock);
	/*
	 * Does a valid window exist?
	 */
	if (micscif_query_window(&req)) {
		printk(KERN_ERR "%s %d -ENXIO\n", __func__, __LINE__);
		msg->uop = SCIF_UNREGISTER_ACK;
		goto error;
	}

	RMA_MAGIC(window);

	if (window->ref_count)
		put_window_ref_count(window, window->nr_pages);

	if (!window->ref_count) {
		/* Last reference dropped: detach and schedule cleanup */
		atomic_inc(&ep->rma_info.tw_refcount);
		atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
		ep->rma_info.async_list_del = 1;
		list_del(&window->list_member);
		micscif_free_window_offset(ep, window->offset,
			window->nr_pages << PAGE_SHIFT);
		window->offset_freed = true;
		del_window = 1;
	}
error:
	mutex_unlock(&ep->rma_info.rma_lock);
	if (del_window)
		micscif_queue_for_cleanup(window, &ms_info.mi_rma);
}
+
+/**
+ * scif_recv_mark: Handle SCIF_MARK request
+ * @msg: Interrupt message
+ *
+ * The peer has requested a mark.
+ */
+static __always_inline void
+scif_recv_mark(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ int mark;
+
+ if (SCIFEP_CONNECTED != ep->state) {
+ msg->payload[0] = ep->remote_ep;
+ msg->uop = SCIF_MARK_NACK;
+ micscif_nodeqp_send(ep->remote_dev, msg, ep);
+ return;
+ }
+
+ if ((mark = micscif_fence_mark(ep)) < 0)
+ msg->uop = SCIF_MARK_NACK;
+ else
+ msg->uop = SCIF_MARK_ACK;
+ msg->payload[0] = ep->remote_ep;
+ msg->payload[2] = mark;
+ micscif_nodeqp_send(ep->remote_dev, msg, ep);
+}
+
+/**
+ * scif_recv_mark_resp: Handle SCIF_MARK_(N)ACK messages.
+ * @msg: Interrupt message
+ *
+ * The peer has responded to a SCIF_MARK message.
+ */
+static __always_inline void
+scif_recv_mark_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ struct fence_info *fence_req = (struct fence_info *)msg->payload[1];
+
+ mutex_lock(&ep->rma_info.rma_lock);
+ if (SCIF_MARK_ACK == msg->uop) {
+ fence_req->state = OP_COMPLETED;
+ fence_req->dma_mark = (int)msg->payload[2];
+ } else
+ fence_req->state = OP_FAILED;
+ wake_up(&fence_req->wq);
+ mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/**
+ * scif_recv_wait: Handle SCIF_WAIT request
+ * @msg: Interrupt message
+ *
+ * The peer has requested waiting on a fence.
+ */
+static __always_inline void
+scif_recv_wait(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ struct remote_fence_info *fence;
+
+ /*
+ * Allocate structure for remote fence information and
+ * send a NACK if the allocation failed. The peer will
+ * return ENOMEM upon receiving a NACK.
+ */
+ if (!(fence = (struct remote_fence_info *)kmalloc(
+ sizeof(struct remote_fence_info), GFP_KERNEL))) {
+ msg->payload[0] = ep->remote_ep;
+ msg->uop = SCIF_WAIT_NACK;
+ micscif_nodeqp_send(ep->remote_dev, msg, ep);
+ return;
+ }
+
+ /* Prepare the fence request */
+ memcpy(&fence->msg, msg, sizeof(struct nodemsg));
+ INIT_LIST_HEAD(&fence->list_member);
+
+ /* Insert to the global remote fence request list */
+ mutex_lock(&ms_info.mi_fencelock);
+ ep->rma_info.fence_refcount++;
+ list_add_tail(&fence->list_member, &ms_info.mi_fence);
+ mutex_unlock(&ms_info.mi_fencelock);
+
+ queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
+}
+
+/**
+ * scif_recv_wait_resp: Handle SCIF_WAIT_(N)ACK messages.
+ * @msg: Interrupt message
+ *
+ * The peer has responded to a SCIF_WAIT message.
+ */
+static __always_inline void
+scif_recv_wait_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ struct fence_info *fence_req = (struct fence_info *)msg->payload[1];
+
+ mutex_lock(&ep->rma_info.rma_lock);
+ if (SCIF_WAIT_ACK == msg->uop)
+ fence_req->state = OP_COMPLETED;
+ else
+ fence_req->state = OP_FAILED;
+ wake_up(&fence_req->wq);
+ mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/**
+ * scif_recv_local_signal: Handle SCIF_SIG_LOCAL request
+ * @msg: Interrupt message
+ *
+ * The peer has requested a signal on a local offset.
+ */
+static __always_inline void
+scif_recv_signal_local(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ int err = 0;
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+
+ err = micscif_prog_signal(ep,
+ msg->payload[1],
+ msg->payload[2],
+ RMA_WINDOW_SELF);
+ if (err)
+ msg->uop = SCIF_SIG_NACK;
+ else
+ msg->uop = SCIF_SIG_ACK;
+ msg->payload[0] = ep->remote_ep;
+ if ((err = micscif_nodeqp_send(ep->remote_dev, msg, ep)))
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+}
+
+/**
+ * scif_recv_signal_remote: Handle SCIF_SIGNAL_REMOTE request
+ * @msg: Interrupt message
+ *
+ * The peer has requested a signal on a remote offset.
+ */
+static __always_inline void
+scif_recv_signal_remote(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ int err = 0;
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+
+ err = micscif_prog_signal(ep,
+ msg->payload[1],
+ msg->payload[2],
+ RMA_WINDOW_PEER);
+ if (err)
+ msg->uop = SCIF_SIG_NACK;
+ else
+ msg->uop = SCIF_SIG_ACK;
+ msg->payload[0] = ep->remote_ep;
+ if ((err = micscif_nodeqp_send(ep->remote_dev, msg, ep)))
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+}
+
+/**
+ * scif_recv_signal_remote: Handle SCIF_SIG_(N)ACK messages.
+ * @msg: Interrupt message
+ *
+ * The peer has responded to a signal request.
+ */
+static __always_inline void
+scif_recv_signal_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ struct fence_info *fence_req = (struct fence_info *)msg->payload[3];
+
+ mutex_lock(&ep->rma_info.rma_lock);
+ if (SCIF_SIG_ACK == msg->uop)
+ fence_req->state = OP_COMPLETED;
+ else
+ fence_req->state = OP_FAILED;
+ wake_up(&fence_req->wq);
+ mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/*
+ * scif_node_wake_up_ack: Handle SCIF_NODE_WAKE_UP_ACK message
+ * @msg: Interrupt message
+ *
+ * Response for a SCIF_NODE_WAKE_UP message.
+ */
+static __always_inline void
+scif_node_wake_up_ack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ scif_dev[msg->payload[0]].sd_wait_status = OP_COMPLETED;
+ wake_up(&scif_dev[msg->payload[0]].sd_wq);
+}
+
+/*
+ * scif_node_wake_up_nack: Handle SCIF_NODE_WAKE_UP_NACK message
+ * @msg: Interrupt message
+ *
+ * Response for a SCIF_NODE_WAKE_UP message.
+ */
+static __always_inline void
+scif_node_wake_up_nack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ scif_dev[msg->payload[0]].sd_wait_status = OP_FAILED;
+ wake_up(&scif_dev[msg->payload[0]].sd_wq);
+}
+
+/*
+ * scif_node_remove: Handle SCIF_NODE_REMOVE message
+ * @msg: Interrupt message
+ *
+ * Handle node removal.
+ */
+static __always_inline void
+scif_node_remove(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ msg->payload[0] = micscif_handle_remove_node(msg->payload[0], msg->payload[1]);
+ msg->uop = SCIF_NODE_REMOVE_ACK;
+ msg->src.node = ms_info.mi_nodeid;
+ micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL);
+}
+
+#ifndef _MIC_SCIF_
+/*
+ * scif_node_remove_ack: Handle SCIF_NODE_REMOVE_ACK message
+ * @msg: Interrupt message
+ *
+ * The peer has acked a SCIF_NODE_REMOVE message.
+ */
+static __always_inline void
+scif_node_remove_ack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ bool ack_is_current = true;
+ int orig_node = (int)msg->payload[3];
+
+ if ((msg->payload[1] << 32) == DISCONN_TYPE_POWER_MGMT) {
+ if (msg->payload[2] != atomic_long_read(&ms_info.mi_unique_msgid))
+ ack_is_current = false;
+ }
+
+ if (ack_is_current) {
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(orig_node - 1);
+ if (!mic_ctx) {
+ printk(KERN_ERR "%s %d mic_ctx %p orig_node %d\n",
+ __func__, __LINE__, mic_ctx, orig_node);
+ return;
+ }
+
+ if (msg->payload[0]) {
+ pr_debug("%s failed to get remove ack from node id %d", __func__, msg->src.node);
+ ms_info.mi_disconnect_status = OP_FAILED;
+ }
+
+ atomic_inc(&mic_ctx->disconn_rescnt);
+ wake_up(&ms_info.mi_disconn_wq);
+ }
+}
+
+/*
+ * scif_node_create_ack: Handle SCIF_NODE_CREATE_DEP message
+ * @msg: Interrupt message
+ *
+ * Notification about a new SCIF dependency between two nodes.
+ */
+static __always_inline void
+scif_node_create_dep(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ uint32_t src_node = msg->src.node;
+ uint32_t dst_node = (uint32_t)msg->payload[0];
+ /*
+ * Host driver updates dependency graph.
+ * src_node created dependency on dst_node
+ * src_node -> dst_node
+ */
+ micscif_set_nodedep(src_node, dst_node, DEP_STATE_DEPENDENT);
+}
+
+/*
+ * scif_node_destroy_ack: Handle SCIF_NODE_DESTROY_DEP message
+ * @msg: Interrupt message
+ *
+ * Notification about tearing down an existing SCIF dependency
+ * between two nodes.
+ */
+static __always_inline void
+scif_node_destroy_dep(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ uint32_t src_node = msg->src.node;
+ uint32_t dst_node = (uint32_t)msg->payload[0];
+ /*
+ * Host driver updates dependency graph.
+ * src_node removed dependency on dst_node
+ */
+ micscif_set_nodedep(src_node, dst_node, DEP_STATE_NOT_DEPENDENT);
+}
+
+/*
+ * scif_node_wake_up: Handle SCIF_NODE_WAKE_UP message
+ * @msg: Interrupt message
+ *
+ * The host has received a request to wake up a remote node.
+ */
+static __always_inline void
+scif_node_wake_up(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ /*
+ * Host Driver now needs to wake up the remote node
+ * available in msg->payload[0].
+ */
+ uint32_t ret = 0;
+ ret = micscif_connect_node((uint32_t)msg->payload[0], false);
+
+ if(!ret) {
+ msg->uop = SCIF_NODE_WAKE_UP_ACK;
+ micscif_update_p2p_state((uint32_t)msg->payload[0],
+ msg->src.node, SCIFDEV_RUNNING);
+ } else {
+ msg->uop = SCIF_NODE_WAKE_UP_NACK;
+ }
+ micscif_nodeqp_send(&scif_dev[msg->src.node], msg, NULL);
+}
+#endif
+
+#ifdef _MIC_SCIF_
+static __always_inline void
+scif_node_alive_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ msg->uop = SCIF_NODE_ALIVE_ACK;
+ msg->src.node = ms_info.mi_nodeid;
+ msg->dst.node = SCIF_HOST_NODE;
+ micscif_nodeqp_send(&scif_dev[SCIF_HOST_NODE], msg, NULL);
+ pr_debug("node alive ack sent from node %d oops_in_progress %d\n",
+ ms_info.mi_nodeid, oops_in_progress);
+}
+#else
+static __always_inline void
+scif_node_alive_ack(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ pr_debug("node alive ack received from node %d\n", msg->src.node);
+ atomic_set(&scif_dev[msg->src.node].sd_node_alive, 1);
+ wake_up(&scifdev->sd_watchdog_wq);
+}
+#endif
+
+
+#ifdef _MIC_SCIF_
+static __always_inline void
+_scif_proxy_dma(struct micscif_dev *scifdev, struct nodemsg *msg, int flags)
+{
+ struct endpt *ep = (struct endpt *)msg->payload[0];
+ off_t loffset = msg->payload[1];
+ off_t roffset = msg->payload[2];
+ size_t len = msg->payload[3];
+ struct dma_channel *chan = ep->rma_info.dma_chan;
+ struct endpt_rma_info *rma = &ep->rma_info;
+ int err = __scif_writeto(ep, loffset, len, roffset, flags);
+
+ if (!err && rma->proxy_dma_peer_phys &&
+ !request_dma_channel(chan)) {
+ do_status_update(chan, rma->proxy_dma_peer_phys, OP_COMPLETED);
+ free_dma_channel(chan);
+ }
+ if (!rma->proxy_dma_peer_phys)
+ /* The proxy DMA physical address should have been set up? */
+ WARN_ON(1);
+}
+
+/**
+ * scif_proxy_dma: Handle SCIF_PROXY_DMA request.
+ * @msg: Interrupt message
+ *
+ * The peer has requested a Proxy DMA.
+ */
+static __always_inline void
+scif_proxy_dma(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ _scif_proxy_dma(scifdev, msg, 0x0);
+}
+
+/**
+ * scif_proxy_ordered_dma: Handle SCIF_PROXY_ORDERED_DMA request.
+ * @msg: Interrupt message
+ *
+ * The peer has requested an ordered Proxy DMA.
+ */
+static __always_inline void
+scif_proxy_ordered_dma(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ _scif_proxy_dma(scifdev, msg, SCIF_RMA_ORDERED);
+}
+#endif
+
+#ifndef _MIC_SCIF_
+/**
+ * scif_node_connect: Respond to SCIF_NODE_CONNECT interrupt message
+ * @msg: Interrupt message
+ *
+ * Connect the src and dst node by setting up the p2p connection
+ * between them. Host here acts like a proxy.
+ */
static __always_inline void
scif_node_connect_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
{
	struct micscif_dev *dev_j = scifdev;
	struct micscif_dev *dev_i = NULL;
	struct scif_p2p_info *p2p_ij = NULL;    /* bus addr for j from i */
	struct scif_p2p_info *p2p_ji = NULL;    /* bus addr for i from j */
	struct scif_p2p_info *p2p;
	struct list_head *pos, *tmp;
	uint32_t bid = (uint32_t)msg->payload[0];
	int err;
	uint64_t tmppayload;

	pr_debug("%s:%d SCIF_NODE_CONNECT from %d connecting to %d \n",
		__func__, __LINE__, scifdev->sd_node, bid);

	/* Validate the requested board id under the config lock. */
	mutex_lock(&ms_info.mi_conflock);
	if (bid < 1 || bid > ms_info.mi_maxid) {
		printk(KERN_ERR "%s %d unknown bid %d\n", __func__, __LINE__, bid);
		goto nack;
	}

	dev_i = &scif_dev[bid];
	/*
	 * Drop the config lock while taking the node reference: the
	 * refcount call may block, and the lock is re-taken immediately
	 * after. NOTE(review): the unlock/lock window looks deliberate;
	 * confirm micscif_inc_node_refcnt may sleep.
	 */
	mutex_unlock(&ms_info.mi_conflock);
	micscif_inc_node_refcnt(dev_i, 1);
	mutex_lock(&ms_info.mi_conflock);

	if (dev_i->sd_state != SCIFDEV_RUNNING)
		goto ref_nack;

	/*
	 * If the p2p connection is already setup or in the process of setting up
	 * then just ignore this request. The requested node will get informed
	 * by SCIF_NODE_ADD_ACK or SCIF_NODE_ADD_NACK
	 */
	if (!list_empty(&dev_i->sd_p2p)) {
		list_for_each_safe(pos, tmp, &dev_i->sd_p2p) {
			p2p = list_entry(pos, struct scif_p2p_info,
				ppi_list);
			if (p2p->ppi_peer_id == dev_j->sd_node) {
				mutex_unlock(&ms_info.mi_conflock);
				micscif_dec_node_refcnt(dev_i, 1);
				return;
			}
		}
	}

	/*
	 * Build p2p mapping info in both directions and queue it on
	 * both devices. NOTE(review): init_p2p_info() results are not
	 * checked for NULL here -- confirm it cannot fail, or add
	 * error handling.
	 */
	p2p_ij = init_p2p_info(dev_i, dev_j);
	p2p_ji = init_p2p_info(dev_j, dev_i);

	list_add_tail(&p2p_ij->ppi_list, &dev_i->sd_p2p);
	list_add_tail(&p2p_ji->ppi_list, &dev_j->sd_p2p);

	/* Send a SCIF_NODE_ADD to dev_i, pass it its bus address
	 * as seen from dev_j
	 */
	msg->uop = SCIF_NODE_ADD;
	msg->src.node = dev_j->sd_node;
	msg->dst.node = dev_i->sd_node;

	p2p_ji->ppi_mic_addr[PPI_APER] = mic_map(msg->src.node - 1,
		p2p_ji->ppi_pa[PPI_APER],
		p2p_ji->ppi_len[PPI_APER] << PAGE_SHIFT);
	msg->payload[0] = p2p_ji->ppi_mic_addr[PPI_APER];

	/* addresses for node j */
	p2p_ij->ppi_mic_addr[PPI_MMIO] = mic_map(msg->dst.node - 1,
		p2p_ij->ppi_pa[PPI_MMIO],
		p2p_ij->ppi_len[PPI_MMIO] << PAGE_SHIFT);
	msg->payload[1] = p2p_ij->ppi_mic_addr[PPI_MMIO];

	p2p_ij->ppi_mic_addr[PPI_APER] = mic_map(msg->dst.node - 1,
		p2p_ij->ppi_pa[PPI_APER],
		p2p_ij->ppi_len[PPI_APER] << PAGE_SHIFT);
	msg->payload[2] = p2p_ij->ppi_mic_addr[PPI_APER];

	msg->payload[3] = p2p_ij->ppi_len[PPI_APER] << PAGE_SHIFT;

	if ((err = micscif_nodeqp_send(dev_i, msg, NULL))) {
		printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err);
		goto ref_nack;
	}

	/* Same as above but to dev_j */
	msg->uop = SCIF_NODE_ADD;
	msg->src.node = dev_i->sd_node;
	msg->dst.node = dev_j->sd_node;

	/* Swap aperture addresses: payload[0]/[2] change direction. */
	tmppayload = msg->payload[0];
	msg->payload[0] = msg->payload[2];
	msg->payload[2] = tmppayload;

	p2p_ji->ppi_mic_addr[PPI_MMIO] = mic_map(msg->dst.node - 1, p2p_ji->ppi_pa[PPI_MMIO],
		p2p_ji->ppi_len[PPI_MMIO] << PAGE_SHIFT);
	msg->payload[1] = p2p_ji->ppi_mic_addr[PPI_MMIO];
	msg->payload[3] = p2p_ji->ppi_len[PPI_APER] << PAGE_SHIFT;

	if ((err = micscif_nodeqp_send(dev_j, msg, NULL))) {
		printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err);
		goto ref_nack;
	}

	mutex_unlock(&ms_info.mi_conflock);
	micscif_dec_node_refcnt(dev_i, 1);
	return;
ref_nack:
	/* Failure after the node reference was taken: drop it first. */
	micscif_dec_node_refcnt(dev_i, 1);
nack:
	mutex_unlock(&ms_info.mi_conflock);
	msg->uop = SCIF_NODE_CONNECT_NACK;
	msg->dst.node = dev_j->sd_node;
	msg->payload[0] = bid;
	if ((err = micscif_nodeqp_send(dev_j, msg, NULL)))
		printk(KERN_ERR "%s %d error %d\n", __func__, __LINE__, err);
}
#endif /* !_MIC_SCIF_ */
+
+#ifdef _MIC_SCIF_
+/**
+ * scif_node_connect_nack_resp: Respond to SCIF_NODE_CONNECT_NACK interrupt message
+ * @msg: Interrupt message
+ *
+ * Tell the node that initiated SCIF_NODE_CONNECT earlier has failed.
+ */
+static __always_inline void
+scif_node_connect_nack_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ struct micscif_dev *peerdev;
+ unsigned int bid = msg->payload[0];
+
+ if (bid > MAX_BOARD_SUPPORTED) {
+ printk(KERN_ERR "recieved a nack for invalid bid %d\n", bid);
+ WARN_ON(1);
+ return;
+ }
+
+ peerdev = &scif_dev[bid];
+ mutex_lock(&peerdev->sd_lock);
+ peerdev->sd_state = SCIFDEV_NOTPRESENT;
+ mutex_unlock(&peerdev->sd_lock);
+ wake_up(&peerdev->sd_p2p_wq);
+}
+#endif
+
+/**
+ * scif_node_add_nack_resp: Respond to SCIF_NODE_ADD_NACK interrupt message
+ * @msg: Interrupt message
+ *
+ * SCIF_NODE_ADD failed, so inform the waiting wq.
+ */
+static __always_inline void
+scif_node_add_nack_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+#ifndef _MIC_SCIF_
+ struct micscif_dev *dst_dev = &scif_dev[msg->dst.node];
+ pr_debug("SCIF_NODE_ADD_NACK recieved from %d \n", scifdev->sd_node);
+ micscif_inc_node_refcnt(dst_dev, 1);
+ micscif_nodeqp_send(dst_dev, msg, NULL);
+ micscif_dec_node_refcnt(dst_dev, 1);
+#else
+ struct micscif_dev *peerdev;
+
+ peerdev = &scif_dev[msg->src.node];
+
+ if (peerdev->sd_state == SCIFDEV_NOTPRESENT)
+ return;
+
+ mutex_lock(&peerdev->sd_lock);
+ peerdev->sd_state = SCIFDEV_NOTPRESENT;
+ mutex_unlock(&peerdev->sd_lock);
+ wake_up(&peerdev->sd_p2p_wq);
+#endif
+}
+
+/**
+ * scif_get_node_info_resp: Respond to SCIF_GET_NODE_INFO interrupt message
+ * @msg: Interrupt message
+ *
+ * Retrieve node info i.e maxid, total and node mask from the host.
+ */
+static __always_inline void
+scif_get_node_info_resp(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+#ifdef _MIC_SCIF_
+ struct get_node_info *node_info = (struct get_node_info *)msg->payload[3];
+
+ mutex_lock(&ms_info.mi_conflock);
+ ms_info.mi_mask = msg->payload[0];
+ ms_info.mi_maxid = msg->payload[1];
+ ms_info.mi_total = msg->payload[2];
+
+ node_info->state = OP_COMPLETED;
+ wake_up(&node_info->wq);
+ mutex_unlock(&ms_info.mi_conflock);
+#else
+ swap(msg->dst.node, msg->src.node);
+ mutex_lock(&ms_info.mi_conflock);
+ msg->payload[0] = ms_info.mi_mask;
+ msg->payload[1] = ms_info.mi_maxid;
+ msg->payload[2] = ms_info.mi_total;
+ mutex_unlock(&ms_info.mi_conflock);
+
+ if (micscif_nodeqp_send(scifdev, msg, NULL))
+ printk(KERN_ERR "%s %d error \n", __func__, __LINE__);
+#endif
+}
+
+#ifdef ENABLE_TEST
+static void
+scif_test(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ if (msg->payload[0] != scifdev->count) {
+ printk(KERN_ERR "Con fail: payload == %llx\n", msg->payload[0]);
+ scifdev->test_done = -1;
+ } else if (scifdev->count == TEST_LOOP) {
+ pr_debug("Test success state %d!\n", scifdev->sd_state);
+ scifdev->test_done = 1;
+ }
+
+ if (scifdev->test_done != 0) {
+ while (scifdev->test_done != 2) {
+ cpu_relax();
+ schedule();
+ }
+
+ destroy_workqueue(scifdev->producer);
+ destroy_workqueue(scifdev->consumer);
+ pr_debug("Destroyed workqueue state %d!\n", scifdev->sd_state);
+ }
+ scifdev->count++;
+}
+#endif /* ENABLE_TEST */
+
+static void
+scif_msg_unknown(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ /* Bogus Node Qp Message? */
+ printk(KERN_ERR "Unknown message 0x%xn scifdev->sd_state 0x%x "
+ "scifdev->sd_node 0x%x\n",
+ msg->uop, scifdev->sd_state, scifdev->sd_node);
+ BUG_ON(1);
+}
+
+#ifdef _MIC_SCIF_
+static void
+smpt_set(struct micscif_dev *scifdev, struct nodemsg *msg)
+{
+ printk("msd recvd : smpt add\n");
+ printk("dma_addr = 0x%llX, entry = 0x%llX\n", msg->payload[0], msg->payload[1]);
+ mic_smpt_set(scif_dev->mm_sbox, msg->payload[0], msg->payload[1]);
+}
+#endif
+
/*
 * Node QP message dispatch table, indexed by message uop (SCIF_* id).
 * Slot order must track the message id enumeration exactly; messages
 * not handled on this side (host vs card, selected by _MIC_SCIF_) are
 * wired to scif_msg_unknown, which BUGs on receipt.
 */
void (*scif_intr_func[SCIF_MAX_MSG + 1])(struct micscif_dev *, struct nodemsg *msg) = {
	scif_msg_unknown,	// Error
	scif_init_resp,		// SCIF_INIT
	scif_exit_resp,		// SCIF_EXIT
	scif_nodeadd_resp,	// SCIF_NODE_ADD
	scif_nodeaddack_resp,	// SCIF_NODE_ADD_ACK
	scif_cnctreq_resp,	// SCIF_CNCT_REQ
	scif_cnctgnt_resp,	// SCIF_CNCT_GNT
	scif_cnctgntack_resp,	// SCIF_CNCT_GNTACK
	scif_cnctgntnack_resp,	// SCIF_CNCT_GNTNACK
	scif_cnctrej_resp,	// SCIF_CNCT_REJ
	scif_cnctterm_resp,	// SCIF_CNCT_TERM 10
	scif_termack_resp,	// SCIF_TERM_ACK
	scif_discnct_resp,	// SCIF_DISCNCT
	scif_discntack_resp,	// SCIF_DISCNT_ACK
	scif_recv_register,	// SCIF_REGISTER
	scif_recv_register_ack,	// SCIF_REGISTER_ACK
	scif_recv_register_nack,	// SCIF_REGISTER_NACK
	scif_recv_unregister,	// SCIF_UNREGISTER
	scif_recv_unregister_ack,	// SCIF_UNREGISTER_ACK
	scif_recv_unregister_nack,	// SCIF_UNREGISTER_NACK
	scif_alloc_req,		// SCIF_ALLOC_REQ 20
	scif_alloc_gnt_rej,	// SCIF_ALLOC_GNT
	scif_alloc_gnt_rej,	// SCIF_ALLOC_REJ
	scif_free_phys,		// SCIF_FREE_PHYS
	scif_free_virt,		// SCIF_FREE_VIRT
	scif_clientsend_resp,	// SCIF_CLIENT_SENT
	scif_clientrcvd_resp,	// SCIF_CLIENT_RCVD
	scif_recv_munmap,	// SCIF_MUNMAP
	scif_recv_mark,		// SCIF_MARK
	scif_recv_mark_resp,	// SCIF_MARK_ACK
	scif_recv_mark_resp,	// SCIF_MARK_NACK 30
	scif_recv_wait,		// SCIF_WAIT
	scif_recv_wait_resp,	// SCIF_WAIT_ACK
	scif_recv_wait_resp,	// SCIF_WAIT_NACK
	scif_recv_signal_local,	// SCIF_SIG_LOCAL
	scif_recv_signal_remote,	// SCIF_SIG_REMOTE
	scif_recv_signal_resp,	// SCIF_SIG_ACK
	scif_recv_signal_resp,	// SCIF_SIG_NACK
#ifdef _MIC_SCIF_
	scif_msg_unknown,
	scif_msg_unknown,
	scif_msg_unknown,
	scif_msg_unknown,
	scif_msg_unknown,	// SCIF_NODE_CREATE_DEP Not on card
	scif_msg_unknown,	// SCIF_NODE_DESTROY_DEP Not on card
#else
	scif_msg_unknown,
	scif_msg_unknown,
	scif_msg_unknown,
	scif_msg_unknown,
	scif_node_create_dep,	// SCIF_NODE_CREATE_DEP
	scif_node_destroy_dep,	// SCIF_NODE_DESTROY_DEP
#endif
	scif_node_remove,	// SCIF_NODE_REMOVE
#ifdef _MIC_SCIF_
	scif_msg_unknown,	// SCIF_NODE_REMOVE_ACK Not on card
	scif_msg_unknown,	// SCIF_NODE_WAKE_UP Not on card
#else
	scif_node_remove_ack,	// SCIF_NODE_REMOVE_ACK
	scif_node_wake_up,	// SCIF_NODE_WAKE_UP
#endif
	scif_node_wake_up_ack,	// SCIF_NODE_WAKE_UP_ACK
	scif_node_wake_up_nack,	// SCIF_NODE_WAKE_UP_NACK
#ifdef _MIC_SCIF_
	scif_node_alive_resp,	// SCIF_NODE_ALIVE
	scif_msg_unknown,	// SCIF_NODE_ALIVE_ACK not on card
	smpt_set,		// SMPT_SET
#else
	scif_msg_unknown,	// SCIF_NODE_ALIVE not on Host
	scif_node_alive_ack,	// SCIF_NODE_ALIVE_ACK
	scif_msg_unknown,	// SCIF_NODE_ALIVE not on Host
#endif
	scif_msg_unknown,
	scif_msg_unknown,
	scif_msg_unknown,
	scif_msg_unknown,
#ifdef _MIC_SCIF_
	scif_proxy_dma,		// SCIF_PROXY_DMA only for MIC
	scif_proxy_ordered_dma,	// SCIF_PROXY_ORDERED_DMA only for MIC
#else
	scif_msg_unknown,
	scif_msg_unknown,
#endif
#ifdef _MIC_SCIF_
	scif_msg_unknown,
	scif_node_connect_nack_resp,	//SCIF_NODE_CONNECT_NACK
#else
	scif_node_connect_resp,	//SCIF_NODE_CONNECT
	scif_msg_unknown,
#endif
	scif_node_add_nack_resp,	//SCIF_NODE_ADD_NACK
	scif_get_node_info_resp,	//SCIF_GET_NODE_INFO
#ifdef ENABLE_TEST
	scif_test		// SCIF_TEST
#else
	scif_msg_unknown
#endif
};
+
+/**
+ * scif_nodeqp_msg_hander() - Common handler for node messages
+ * @scifdev: Remote device to respond to
+ * @qp: Remote memory pointer
+ * @msg: The message to be handled.
+ *
+ * This routine calls the appriate routine to handle a Node Qp message receipt.
+ */
int micscif_max_msg_id = SCIF_MAX_MSG;	/* Highest valid Node QP message id */
+
+static void
+micscif_nodeqp_msg_handler(struct micscif_dev *scifdev, struct micscif_qp *qp, struct nodemsg *msg)
+{
+ micscif_display_message(scifdev, msg, "Rcvd");
+
+ if (msg->uop > (uint32_t)micscif_max_msg_id) {
+ /* Bogus Node Qp Message? */
+ printk(KERN_ERR "Unknown message 0x%xn scifdev->sd_state 0x%x "
+ "scifdev->sd_node 0x%x\n",
+ msg->uop, scifdev->sd_state, scifdev->sd_node);
+ BUG_ON(1);
+ }
+
+ scif_intr_func[msg->uop](scifdev, msg);
+}
+
+/**
+ * scif_nodeqp_intrhander() - Interrupt handler for node messages
+ * @scifdev: Remote device to respond to
+ * @qp: Remote memory pointer
+ *
+ * This routine is triggered by the interrupt mechanism. It reads
+ * messages from the node queue RB and calls the Node QP Message handling
+ * routine.
+ */
int
micscif_nodeqp_intrhandler(struct micscif_dev *scifdev, struct micscif_qp *qp)
{
	struct nodemsg msg;
	int read_size;

	/* Drain the inbound ring buffer one message at a time. */
	do {
#ifndef _MIC_SCIF_
		/* A "blast" flag from the card means endpoints need waking. */
		if (qp->blast) {
			scif_wakeup_ep(SCIF_WAKE_UP_RECV);
			qp->blast = 0;
		}
#endif
		if (SCIFDEV_STOPPED == scifdev->sd_state)
			return 0;
		read_size = micscif_rb_get_next(&qp->inbound_q, &msg,
			sizeof(msg));
		/* Stop handling messages if an oops is in progress */
		if (read_size != sizeof(msg) || oops_in_progress)
			break;
#ifndef _MIC_SCIF_
		/* Any traffic from the card proves it is alive. */
		atomic_set(&scifdev->sd_node_alive, 1);
#endif

		/* Hold a node reference across the handler call. */
		micscif_inc_node_refcnt(scifdev, 1);
		micscif_nodeqp_msg_handler(scifdev, qp, &msg);
		/*
		 * The reference count is reset to SCIF_NODE_IDLE
		 * during scif device cleanup so decrementing the
		 * reference count further is not required.
		 */
		if (SCIFDEV_INIT == scifdev->sd_state)
			return 0;
		if (SCIFDEV_STOPPED == scifdev->sd_state) {
			micscif_dec_node_refcnt(scifdev, 1);
			return 0;
		}
		/*
		 * The read pointer is only advanced after the handler ran,
		 * so a message is never lost to a mid-handler shutdown.
		 */
		micscif_rb_update_read_ptr(&qp->inbound_q);
		micscif_dec_node_refcnt(scifdev, 1);
	} while (read_size == sizeof(msg));
#ifdef _MIC_SCIF_
	/*
	 * Keep polling the Node QP RB in case there are active SCIF
	 * P2P connections to provide better Node QP responsiveness
	 * in anticipation of P2P Proxy DMA requests for performance.
	 */
	if (scifdev->sd_proxy_dma_reads &&
		scifdev->num_active_conn &&
		SCIFDEV_STOPPED != scifdev->sd_state) {
		queue_work(scifdev->sd_intr_wq, &scifdev->sd_intr_bh);
		schedule();
	}
#endif
	return read_size;
}
+
+/**
+ * micscif_loopb_wq_handler - Loopback Workqueue Handler.
+ * @work: loop back work
+ *
+ * This work queue routine is invoked by the loopback work queue handler.
+ * It grabs the recv lock, dequeues any available messages from the head
+ * of the loopback message list, calls the node QP message handler,
+ * waits for it to return, then frees up this message and dequeues more
+ * elements of the list if available.
+ */
+static void micscif_loopb_wq_handler(struct work_struct *work)
+{
+ struct micscif_dev *scifdev =
+ container_of(work, struct micscif_dev, sd_loopb_work);
+ struct micscif_qp *qp = micscif_nodeqp_nextmsg(scifdev);
+ struct loopb_msg *msg;
+
+ do {
+ msg = NULL;
+ spin_lock(&qp->qp_recv_lock);
+ if (!list_empty(&scifdev->sd_loopb_recv_q)) {
+ msg = list_first_entry(&scifdev->sd_loopb_recv_q,
+ struct loopb_msg, list_member);
+ list_del(&msg->list_member);
+ }
+ spin_unlock(&qp->qp_recv_lock);
+
+ if (msg) {
+ micscif_nodeqp_msg_handler(scifdev, qp, &msg->msg);
+ kfree(msg);
+ }
+ } while (msg);
+}
+
+/**
+ * micscif_loopb_msg_handler() - Workqueue handler for loopback messages.
+ * @scifdev: SCIF device
+ * @qp: Queue pair.
+ *
+ * This work queue routine is triggered when a loopback message is received.
+ *
+ * We need special handling for receiving Node Qp messages on a loopback SCIF
+ * device via two workqueues for receiving messages.
+ *
+ * The reason we need the extra workqueue which is not required with *normal*
+ * non-loopback SCIF devices is the potential classic deadlock described below:
+ *
+ * Thread A tries to send a message on a loopback SCIF devide and blocks since
+ * there is no space in the RB while it has the qp_send_lock held or another
+ * lock called lock X for example.
+ *
+ * Thread B: The Loopback Node QP message receive workqueue receives the message
+ * and tries to send a message (eg an ACK) to the loopback SCIF device. It tries
+ * to grab the send lock again or lock X and deadlocks with Thread A. The RB
+ * cannot be drained any further due to this classic deadlock.
+ *
+ * In order to avoid deadlocks as mentioned above we have an extra level of
+ * indirection achieved by having two workqueues.
+ * 1) The first workqueue whose handler is micscif_loopb_msg_handler reads
+ * messages from the Node QP RB, adds them to a list and queues work for the
+ * second workqueue.
+ *
+ * 2) The second workqueue whose handler is micscif_loopb_wq_handler dequeues
+ * messages from the list, handles them, frees up the memory and dequeues
+ * more elements from the list if possible.
+ */
int
micscif_loopb_msg_handler(struct micscif_dev *scifdev, struct micscif_qp *qp)
{
	int read_size;
	struct loopb_msg *msg;

	/*
	 * First-stage loopback handler: copy each message out of the
	 * ring buffer into a freshly allocated list node and queue it
	 * for micscif_loopb_wq_handler, so the RB can be drained without
	 * running handlers (which may send) under the RB's constraints.
	 */
	do {
		if (!(msg = kmalloc(sizeof(struct loopb_msg), GFP_KERNEL))) {
			printk(KERN_ERR "%s %d ENOMEM\n", __func__, __LINE__);
			return -ENOMEM;
		}

		read_size = micscif_rb_get_next(&qp->inbound_q, &msg->msg,
			sizeof(struct nodemsg));

		if (read_size != sizeof(struct nodemsg)) {
			/*
			 * Short read: nothing (complete) left in the RB.
			 * NOTE(review): the read pointer is still updated
			 * here before breaking -- confirm this is intended
			 * for partial reads and cannot skip data.
			 */
			kfree(msg);
			micscif_rb_update_read_ptr(&qp->inbound_q);
			break;
		}

		spin_lock(&qp->qp_recv_lock);
		list_add_tail(&msg->list_member, &scifdev->sd_loopb_recv_q);
		spin_unlock(&qp->qp_recv_lock);
		queue_work(scifdev->sd_loopb_wq, &scifdev->sd_loopb_work);
		micscif_rb_update_read_ptr(&qp->inbound_q);
	} while (read_size == sizeof(struct nodemsg));
	return read_size;
}
+
+/**
+ * micscif_setup_loopback_qp - One time setup work for Loopback Node Qp.
+ * @scifdev: SCIF device
+ *
+ * Sets up the required loopback workqueues, queue pairs, ring buffers
+ * and also tests out the Queue Pairs.
+ */
+int micscif_setup_loopback_qp(struct micscif_dev *scifdev)
+{
+ int err = 0;
+ void *local_q;
+ struct micscif_qp *qp;
+
+ /* Set up the work queues */
+ if ((err = micscif_setup_interrupts(scifdev)))
+ goto error;
+
+ INIT_LIST_HEAD(&scifdev->sd_loopb_recv_q);
+ snprintf(scifdev->sd_loopb_wqname, sizeof(scifdev->sd_loopb_wqname),
+ "SCIF LOOPB %d", scifdev->sd_node);
+ if (!(scifdev->sd_loopb_wq =
+ __mic_create_singlethread_workqueue(scifdev->sd_loopb_wqname))){
+ err = -ENOMEM;
+ goto destroy_intr_wq;
+ }
+ INIT_WORK(&scifdev->sd_loopb_work, micscif_loopb_wq_handler);
+ /* Allocate Self Qpair */
+ scifdev->n_qpairs = 1;
+ scifdev->qpairs = (struct micscif_qp *)kzalloc(sizeof(struct micscif_qp), GFP_KERNEL);
+ if (!scifdev->qpairs) {
+ printk(KERN_ERR "Node QP Allocation failed\n");
+ err = -ENOMEM;
+ goto destroy_loopb_wq;
+ }
+
+ qp = scifdev->qpairs;
+ qp->magic = SCIFEP_MAGIC;
+ spin_lock_init(&qp->qp_send_lock);
+ spin_lock_init(&qp->qp_recv_lock);
+ init_waitqueue_head(&scifdev->sd_mmap_wq);
+
+ local_q = kzalloc(NODE_QP_SIZE, GFP_KERNEL);
+ if (!local_q) {
+ printk(KERN_ERR "Ring Buffer Allocation Failed\n");
+ err = -ENOMEM;
+ goto free_qpairs;
+ }
+
+ /*
+ * For loopback the inbound_q and outbound_q are essentially the same
+ * since the Node sends a message on the loopback interface to the
+ * outbound_q which is then received on the inbound_q.
+ */
+ micscif_rb_init(&qp->outbound_q,
+ &(scifdev->qpairs[0].local_read),
+ &(scifdev->qpairs[0].local_write),
+ local_q,
+ NODE_QP_SIZE);
+
+ micscif_rb_init(&(qp->inbound_q),
+ &(scifdev->qpairs[0].local_read),
+ &(scifdev->qpairs[0].local_write),
+ local_q,
+ NODE_QP_SIZE);
+
+ /* Launch the micscif_rb test */
+#ifdef ENABLE_TEST
+ micscif_qp_testboth(scifdev);
+#endif
+ return err;
+free_qpairs:
+ kfree(scifdev->qpairs);
+destroy_loopb_wq:
+ destroy_workqueue(scifdev->sd_loopb_wq);
+destroy_intr_wq:
+ destroy_workqueue(scifdev->sd_intr_wq);
+error:
+ return err;
+}
+
+/**
+ * micscif_destroy_loopback_qp - One time uninit work for Loopback Node Qp
+ * @scifdev: SCIF device
+ *
+ * Detroys the workqueues and frees up the Ring Buffer and Queue Pair memory.
+ */
+int micscif_destroy_loopback_qp(struct micscif_dev *scifdev)
+{
+ micscif_destroy_interrupts(scifdev);
+ destroy_workqueue(scifdev->sd_loopb_wq);
+ kfree((void *)scifdev->qpairs->outbound_q.rb_base);
+ kfree(scifdev->qpairs);
+ return 0;
+}
+
+#ifndef _MIC_SCIF_
+void micscif_destroy_p2p(mic_ctx_t *mic_ctx)
+{
+ mic_ctx_t * mic_ctx_peer;
+ struct micscif_dev *mic_scif_dev;
+ struct micscif_dev *peer_dev;
+ struct scif_p2p_info *p2p;
+ struct list_head *pos, *tmp;
+ uint32_t bd;
+
+ if (!mic_p2p_enable)
+ return;
+
+
+ /* FIXME: implement node deletion */
+ mic_scif_dev = &scif_dev[mic_get_scifnode_id(mic_ctx)];
+
+ /* Free P2P mappings in the given node for all its peer nodes */
+ list_for_each_safe(pos, tmp, &mic_scif_dev->sd_p2p) {
+ p2p = list_entry(pos, struct scif_p2p_info,
+ ppi_list);
+
+ mic_unmap(mic_ctx->bi_id, p2p->ppi_mic_addr[PPI_MMIO],
+ p2p->ppi_len[PPI_MMIO] << PAGE_SHIFT);
+ mic_unmap(mic_ctx->bi_id, p2p->ppi_mic_addr[PPI_APER],
+ p2p->ppi_len[PPI_APER] << PAGE_SHIFT);
+ pci_unmap_sg(mic_ctx->bi_pdev,
+ p2p->ppi_sg[PPI_MMIO], p2p->sg_nentries[PPI_MMIO], PCI_DMA_BIDIRECTIONAL);
+ micscif_p2p_freesg(p2p->ppi_sg[PPI_MMIO]);
+ pci_unmap_sg(mic_ctx->bi_pdev,
+ p2p->ppi_sg[PPI_APER], p2p->sg_nentries[PPI_APER], PCI_DMA_BIDIRECTIONAL);
+ micscif_p2p_freesg(p2p->ppi_sg[PPI_APER]);
+ list_del(pos);
+ kfree(p2p);
+ }
+
+ /* Free P2P mapping created in the peer nodes for the given node */
+ for (bd = SCIF_HOST_NODE + 1; bd <= ms_info.mi_maxid; bd++) {
+ peer_dev = &scif_dev[bd];
+
+ list_for_each_safe(pos, tmp, &peer_dev->sd_p2p) {
+ p2p = list_entry(pos, struct scif_p2p_info,
+ ppi_list);
+ if (p2p->ppi_peer_id == mic_get_scifnode_id(mic_ctx)) {
+
+ mic_ctx_peer = get_per_dev_ctx(peer_dev->sd_node - 1);
+ mic_unmap(mic_ctx_peer->bi_id, p2p->ppi_mic_addr[PPI_MMIO],
+ p2p->ppi_len[PPI_MMIO] << PAGE_SHIFT);
+ mic_unmap(mic_ctx_peer->bi_id, p2p->ppi_mic_addr[PPI_APER],
+ p2p->ppi_len[PPI_APER] << PAGE_SHIFT);
+ pci_unmap_sg(mic_ctx_peer->bi_pdev,
+ p2p->ppi_sg[PPI_MMIO], p2p->sg_nentries[PPI_MMIO], PCI_DMA_BIDIRECTIONAL);
+ micscif_p2p_freesg(p2p->ppi_sg[PPI_MMIO]);
+ pci_unmap_sg(mic_ctx_peer->bi_pdev, p2p->ppi_sg[PPI_APER],
+ p2p->sg_nentries[PPI_APER], PCI_DMA_BIDIRECTIONAL);
+ micscif_p2p_freesg(p2p->ppi_sg[PPI_APER]);
+ list_del(pos);
+ kfree(p2p);
+ }
+ }
+ }
+}
+#endif
+
+/**
+ * ONLY TEST CODE BELOW
+ */
+#ifdef ENABLE_TEST
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include "mic/micscif_nodeqp.h"
+
+static void micscif_rb_trigger_consumer(struct work_struct *work)
+{
+ struct micscif_dev *scifdev = container_of(work, struct micscif_dev, consumer_work);
+
+ while (scifdev->test_done == 0) {
+ cpu_relax();
+ schedule();
+ }
+ if (scifdev->test_done != 1)
+ printk(KERN_ERR "Consumer failed!\n");
+ else
+ pr_debug("Test finished: Success\n");
+ scifdev->test_done = 2;
+}
+
+/**
+ * micscif_rb_trigger_producer
+ * This is the producer thread to create messages and update the
+ * RB write offset accordingly.
+ */
+static void micscif_rb_trigger_producer(struct work_struct *work)
+{
+ struct nodemsg msg;
+ int count = 0;
+ struct micscif_dev *scifdev = container_of(work, struct micscif_dev, producer_work);
+
+ msg.dst.node = scifdev->sd_node;
+ msg.uop = SCIF_TEST;
+
+ while (count <= TEST_LOOP) {
+ msg.payload[0] = count++;
+ micscif_nodeqp_send(scifdev, &msg, NULL);
+ /* pr_debug(("Prod payload %llu\n", msg.payload[0]); */
+ }
+}
+
+/* this is called from the host and the card at the same time on a queue pair.
+ * Each sets up a producer and a consumer and spins on the queue pair until done
+ */
+static void micscif_qp_testboth(struct micscif_dev *scifdev)
+{
+ scifdev->count = 0;
+ scifdev->test_done = 0;
+ snprintf(scifdev->producer_name, sizeof(scifdev->producer_name),
+ "PRODUCER %d", scifdev->sd_node);
+ snprintf(scifdev->consumer_name, sizeof(scifdev->consumer_name),
+ "CONSUMER %d", scifdev->sd_node);
+ scifdev->producer =
+ __mic_create_singlethread_workqueue(scifdev->producer_name);
+ scifdev->consumer =
+ __mic_create_singlethread_workqueue(scifdev->consumer_name);
+
+ INIT_WORK(&scifdev->producer_work, micscif_rb_trigger_producer);
+ INIT_WORK(&scifdev->consumer_work, micscif_rb_trigger_consumer);
+
+ queue_work(scifdev->producer, &scifdev->producer_work);
+ queue_work(scifdev->consumer, &scifdev->consumer_work);
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
 * Port reservation mechanism.
+ * Since this goes with SCIF it must be available for any OS
+ * and should not consume IP ports. Therefore, roll our own.
+ * This is not required to be high performance, so a simple bit
+ * array should do just fine.
+ *
+ * API specification (loosely):
+ *
+ * uint16_t port
+ * Port number is a 16 bit unsigned integer
+ *
+ * uint16_t rsrv_scif_port(uint16_t)
+ * reserve specified port #
+ * returns port #, or 0 if port unavailable.
+ *
+ * uint16_t get_scif_port(void)
+ * reserve any available port #
+ * returns port #, or 0 if no ports available
+ *
+ * void put_scif_port(uint16_t)
+ * release port #
+ *
+ * Reserved ports comes from the lower end of the allocatable range,
+ * and is reserved only in the sense that get_scif_port() won't use
+ * them and there is only a predefined count of them available.
+ */
+
+#include <mic/micscif.h>
+
+/*
+ * Manifests
+ * Port counts must be an integer multiple of 64
+ */
+
#define SCIF_PORT_BASE	0x0000	/* Start port (port reserved if 0) */
#define SCIF_PORT_COUNT	0x10000	/* Ports available */

#if SCIF_PORT_RSVD > (SCIF_PORT_COUNT/2)
#error "No more than half of scif ports can be reserved !!"
#endif
/*
 * Port numbers are 16-bit unsigned, so the highest usable port is
 * SCIF_PORT_BASE + SCIF_PORT_COUNT - 1 <= 0xFFFF, i.e. the sum may not
 * exceed (1 << 16).  The previous check compared against (2 << 16),
 * which would have silently accepted a range twice as large as a
 * uint16_t can address.
 */
#if (SCIF_PORT_BASE + SCIF_PORT_COUNT) > (1 << 16)
#error "Scif ports cannot exceed 16 bit !!"
#endif
+
+#include <linux/bitops.h>
+#include <linux/spinlock_types.h>
+static spinlock_t port_lock = __SPIN_LOCK_UNLOCKED(port_lock);
+
+/*
+ * Data structures
+ * init_array Flag for initialize (mark as init_code?)
+ * port_bits 1 bit representing each possible port.
+ * first_free Index into port_bits for free area
+ * port_lock Lock for exclusive access
+ * port_rsvd Total of successful "get/resv" calls.
+ * port_free Total of successful "free" calls.
 *	port_err	Total of unsuccessful calls.
+ */
+
+#define BITS_PR_PORT (8 * sizeof(uint64_t))
+#define PORTS_ARRAY_SIZE ((SCIF_PORT_COUNT + (BITS_PR_PORT - 1)) / BITS_PR_PORT)
+
+
+static int init_array = 1;
+static uint16_t first_free;
+static uint64_t port_bits[PORTS_ARRAY_SIZE];
+static uint64_t port_rsvd;
+static uint64_t port_free;
+static uint64_t port_err;
+
+
+/*
+ * Bitfield handlers.
+ *
+ * Need 3 bit-fiddlers to operate on individual bits within
+ * one 64 bit word in memory (always passing a pointer).
+ * Individual bits are enumerated from 1, allowing for use
+ * of value 0 to indicate an error condition.
+ *
+ * 1) __scif_ffsclr() returns index of first set bit in the
+ * 64 bit word and clears it. A return value 0 means there
+ * were no set bits in the word.
+ *
+ * 2) __scif_clrbit() clears a specified bit in the 64 bit word
+ * The bit index is returned if bit was previously set and a
+ * value 0 is returned if it was previously clear.
+ *
+ * 3) __scif_setbit() sets a specified bit in the 64 bit word.
+ *
+ * Two versions, one should work for you.
+ */
+
+#if 1 && (defined(__GNUC__) || defined(ICC))
+/*
+ * Use GNU style inline assembly for bit operations.
+ *
+ * Gcc complains about uninitialized use of variables
+ * big_bit in ffsclr and avl in clrbit. Generated code
+ * is correct, just haven't figured out the correct
+ * contraints yet.
+ *
+ * gcc -O2:
+ * __scif_ffsclr: 40 bytes
+ * __scif_clrbit: 34 bytes
+ * __scif_setbit: 17 bytes
+ */
+
/*
 * __scif_ffsclr - find the first (lowest) set bit in *word and clear it.
 * Returns the 1-based index of the bit found, or 0 if *word had no set bits.
 *
 * BSF sets ZF and leaves the destination undefined when the source is 0;
 * the JNZ branch stores -1 into big_bit for that case so the C code below
 * can map it to the "no bits" return value of 0.
 */
static int
__scif_ffsclr(uint64_t *word)
{
	uint64_t big_bit = 0;
	uint64_t field = *word;

	asm volatile (
		"bsfq %1,%0\n\t"	/* big_bit = index of lowest set bit */
		"jnz 1f\n\t"
		"movq $-1,%0\n"		/* source was zero: flag with -1 */
		"jmp 2f\n\t"
		"1:\n\t"
		"btrq %2,%1\n\t"	/* clear the bit just found */
		"2:"
		: "=r" (big_bit), "=r" (field)
		: "0" (big_bit), "1" (field)
	);

	if (big_bit == -1)
		return 0;

	/* Write back only on success; convert the index to 1-based. */
	*word = field;
	return big_bit + 1;
}
+
/*
 * __scif_clrbit - clear a specified (1-based) bit in *word.
 * Returns the bit index if the bit was previously set, 0 if it was
 * already clear.  BTR copies the old bit value into CF; RCL then
 * rotates CF into the low bit of avl, turning the flag into 0/1.
 */
static int
__scif_clrbit(uint64_t *word, uint16_t bit)
{
	uint64_t field = *word;
	uint64_t big_bit = bit;
	int avl = 0;

	big_bit--;	/* convert to 0-based for BTR */
	asm volatile (
		"xorl %2,%2\n\t"
		"btrq %3,%1\n\t"
		"rcll $1,%2\n\t"
		: "=Ir" (big_bit), "=r" (field), "=r" (avl)
		: "0" (big_bit), "1" (field), "2" (avl)
	);

	*word = field;
	return avl ? bit : 0;
}
+
/*
 * __scif_setbit - set a specified (1-based) bit in *word.
 * Unconditional; setting an already-set bit is a no-op.
 */
static void
__scif_setbit(uint64_t *word, uint16_t bit)
{
	uint64_t field = *word;
	uint64_t big_bit = bit;

	big_bit--;	/* convert to 0-based for BTS */
	asm volatile (
		"btsq %2,%1"
		: "=r" (field)
		: "0" (field), "Jr" (big_bit)
	);

	*word = field;
}
+#else
+/*
+ * C inliners for bit operations.
+ *
+ * gcc -O2:
+ * __scif_ffsclr: 50 bytes
+ * __scif_clrbit: 45 bytes
+ * __scif_setbit: 18 bytes
+ *
+ * WARNING:
+ * 1) ffsll() may be glibc specific
+ * 2) kernel ffs() use cmovz instruction that may not
+ * work in uOS kernel (see arch/x86/include/asm/bitops.h)
+ *
+ */
+
+
/*
 * __scif_ffsclr - find and clear the first (lowest) set bit in *word.
 * Returns the 1-based bit index, or 0 if no bit was set.
 * NOTE(review): ffsll() is a glibc extension (see the warning block
 * above) -- confirm it is available in the target environment.
 */
static int
__scif_ffsclr(uint64_t *word)
{
	int bit;
/*
 * ffsll() Find 1st bit in 64 bit word
 */

	bit = ffsll(*word);
	if (bit)
		*word &= ~(1LL << (bit - 1));

	return bit;
}
+
/*
 * Clear bit number "bit" (1-based) in *word.
 * Returns "bit" when the bit was previously set, 0 when it was
 * already clear.  Clearing an already-clear bit is a no-op.
 */
static int
__scif_clrbit(uint64_t *word, uint16_t bit)
{
	uint64_t mask = 1ULL << (bit - 1);
	int was_set = (*word & mask) != 0;

	*word &= ~mask;
	return was_set ? bit : 0;
}
+
/* Set bit number "bit" (1-based) in *word. */
static void
__scif_setbit(uint64_t *word, uint16_t bit)
{
	uint64_t mask = 1ULL << (bit - 1);

	*word |= mask;
}
+#endif
+
+
/*
 * One-time lazy initialization of the port bit array.
 * Every bit is set (bit set == port free), then first_free is pointed
 * past the reserved region so get_scif_port() never allocates from it.
 * When SCIF_PORT_BASE is 0, port 0 is permanently marked busy because a
 * return value of 0 means "no port available".
 * init_array is re-checked under port_lock so concurrent callers
 * racing on the unlocked check in the callers remain safe.
 */
static void
init_scif_array(void)
{
	spin_lock(&port_lock);
	if (init_array) {
		int i;
		for (i = 0; i < PORTS_ARRAY_SIZE; i++)
			port_bits[i] = ~0;	/* all ports start out free */
		first_free = SCIF_PORT_RSVD / BITS_PR_PORT;
		if (!SCIF_PORT_BASE)
			port_bits[0] ^= 1;	/* port 0 is the error sentinel */
		port_rsvd = 0;
		port_free = 0;
		port_err = 0;
		init_array = 0;
	}
	spin_unlock(&port_lock);
	pr_debug("SCIF port array init:\n"
		" %d ports available starting at %d, %d reserved\n"
		" Array consists of %ld %ld-bit wide integers\n",
		SCIF_PORT_BASE ? SCIF_PORT_COUNT : SCIF_PORT_COUNT - 1,
		SCIF_PORT_BASE ? SCIF_PORT_BASE : 1, SCIF_PORT_RSVD,
		PORTS_ARRAY_SIZE, BITS_PR_PORT);
}
+
+
+/*
+ * Reserve a specified port for SCIF
+ * TBD: doxyfy this header
+ */
+uint16_t
+rsrv_scif_port(uint16_t port)
+{
+ uint16_t port_ix;
+
+ if (!port) {
+ pr_debug("rsrv_scif_port: invalid port %d\n", port);
+ port_err++;
+ return 0;
+ }
+
+ if (init_array)
+ init_scif_array();
+
+ port -= SCIF_PORT_BASE;
+ port_ix = port / BITS_PR_PORT;
+
+ spin_lock(&port_lock);
+ port = __scif_clrbit(port_bits + port_ix, 1 + (port % BITS_PR_PORT));
+ if (port) {
+ port = port - 1 + BITS_PR_PORT * port_ix + SCIF_PORT_BASE;
+ port_rsvd++;
+ } else {
+ port_err++;
+ }
+ spin_unlock(&port_lock);
+
+ return port;
+}
+
+
+/*
+ * Get and reserve any port # for SCIF
+ * TBD: doxyfy this header
+ */
+uint16_t
+get_scif_port(void)
+{
+ uint16_t port;
+
+ if (init_array)
+ init_scif_array();
+
+ spin_lock(&port_lock);
+ if (first_free >= PORTS_ARRAY_SIZE) { /* Pool is empty */
+ port = 0;
+ port_err++;
+ goto out;
+ }
+ port = __scif_ffsclr(port_bits + first_free);
+ if (port) {
+ port = port - 1 + BITS_PR_PORT * first_free + SCIF_PORT_BASE;
+ while ((first_free < PORTS_ARRAY_SIZE) && !port_bits[first_free])
+ first_free++;
+ port_rsvd++;
+ } else
+ port_err++;
+out:
+ spin_unlock(&port_lock);
+ return port;
+}
+
+
+/*
+ * Release a reserved port # for SCIF
+ * For now, just ignore release on unreserved port
+ * TBD: doxyfy this header
+ */
+
+void
+put_scif_port(uint16_t port)
+{
+ uint16_t port_ix;
+
+ if (!port) {
+ pr_debug("put_scif_port: invalid port %d\n", port);
+ port_err++;
+ return;
+ }
+
+ port -= SCIF_PORT_BASE;
+ port_ix = port / BITS_PR_PORT;
+
+ spin_lock(&port_lock);
+ __scif_setbit(port_bits + port_ix, 1 + (port % BITS_PR_PORT));
+ if (port >= SCIF_PORT_RSVD && port_ix < first_free)
+ first_free = port_ix;
+ port_free++;
+ spin_unlock(&port_lock);
+}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+#include "mic/micscif_rb.h"
+
+#include <linux/circ_buf.h>
+#include <linux/module.h>
+#define count_in_ring(head, tail, size) CIRC_CNT(head, tail, size)
+#define space_in_ring(head, tail, size) CIRC_SPACE(head, tail, size)
+
+MODULE_LICENSE("GPL");
+
+static void *micscif_rb_get(struct micscif_rb *rb, uint32_t size);
+
+/**
+ * micscif_rb_init - To Initialize the RingBuffer
+ * @rb: The RingBuffer context
+ * @read_ptr: A pointer to the memory location containing
+ * the updated read pointer
+ * @write_ptr: A pointer to the memory location containing
+ * the updated write pointer
+ * @rb_base: The pointer to the ring buffer
+ * @size: The size of the ring buffer
+ */
+void micscif_rb_init(struct micscif_rb *rb,
+ volatile uint32_t *read_ptr,
+ volatile uint32_t *write_ptr,
+ volatile void *rb_base,
+ const uint32_t size)
+{
+ /* Size must be a power of two -- all logic assoicated with
+ * incrementing the read and write pointers relies on the size
+ * being a power of 2
+ */
+ BUG_ON((size & (size-1)) != 0);
+ rb->rb_base = rb_base;
+ rb->size = size;
+ rb->read_ptr = read_ptr;
+ rb->write_ptr = write_ptr;
+ rb->current_read_offset = *read_ptr;
+ rb->current_write_offset = *write_ptr;
+}
+EXPORT_SYMBOL(micscif_rb_init);
+
+/**
+ * micscif_rb_reset - To reset the RingBuffer
+ * @rb - The RingBuffer context
+ */
+void micscif_rb_reset(struct micscif_rb *rb)
+{
+ /*
+ * XPU_RACE_CONDITION: write followed by read
+ * MFENCE after write
+ * Read should take care of SBOX sync
+ * Ponters are volatile (see RingBuffer declaration)
+ */
+ *rb->read_ptr = 0x0;
+ *rb->write_ptr = 0x0;
+ smp_mb();
+ rb->current_write_offset = *rb->write_ptr;
+ rb->current_read_offset = *rb->read_ptr;
+}
+EXPORT_SYMBOL(micscif_rb_reset);
+
+/* Copies a message to the ring buffer -- handles the wrap around case */
+static int memcpy_torb(struct micscif_rb *rb, void *header,
+ void *msg, uint32_t size)
+{
+ /* Need to call two copies if it wraps around */
+ uint32_t size1, size2;
+ if ((char*)header + size >= (char*)rb->rb_base + rb->size) {
+ size1 = (uint32_t) ( ((char*)rb->rb_base + rb->size) - (char*)header);
+ size2 = size - size1;
+ memcpy_toio(header, msg, size1);
+ memcpy_toio(rb->rb_base, (char*)msg+size1, size2);
+ } else {
+ memcpy_toio(header, msg, size);
+ }
+ return 0;
+}
+
+/* Copies a message from the ring buffer -- handles the wrap around case */
+static int memcpy_fromrb(struct micscif_rb *rb, void *header,
+ void *msg, uint32_t size)
+{
+ /* Need to call two copies if it wraps around */
+ uint32_t size1, size2;
+ if ((char*)header + size >= (char*)rb->rb_base + rb->size) {
+ size1 = (uint32_t) ( ((char*)rb->rb_base + rb->size) - (char*)header );
+ size2 = size - size1;
+ memcpy_fromio(msg, header, size1);
+ memcpy_fromio((char*)msg+size1, rb->rb_base, size2);
+ } else {
+ memcpy_fromio(msg, header, size);
+ }
+ return 0;
+}
+
+/**
+ * micscif_rb_space -
+ * Query space available for writing to the given RB.
+ *
+ * @rb - The RingBuffer context
+ *
+ * Returns: size available for writing to RB in bytes.
+ */
+int micscif_rb_space(struct micscif_rb *rb)
+{
+ rb->old_current_read_offset = rb->current_read_offset;
+
+ rb->current_read_offset = *rb->read_ptr;
+ return space_in_ring(rb->current_write_offset,
+ rb->current_read_offset, rb->size);
+}
+EXPORT_SYMBOL(micscif_rb_space);
+
+/**
+ * micscif_rb_write - Write one package to the given ring buffer
+ * @rb - The RingBuffer context
+ * @msg - The package to be put in the ring buffer
+ * @size - the size (in bytes) you want to copy
+ *
+ * This API does not block if there isn't enough space in the RB.
+ */
+int micscif_rb_write(struct micscif_rb *rb,
+ void *msg,
+ uint32_t size)
+{
+ void *header;
+ int ret = 0;
+ if ((uint32_t)micscif_rb_space(rb) < size)
+ return -ENOMEM;
+ header = (char*)rb->rb_base + rb->current_write_offset;
+ ret = memcpy_torb(rb, header, msg, size);
+ if (!ret) {
+ /*
+ * XPU_RACE_CONDITION: Don't do anything here!
+ * Wait until micscif_rb_commit()
+ * Update the local ring buffer data, not the shared data until commit.
+ */
+ rb->old_current_write_offset = rb->current_write_offset;
+ rb->current_write_offset = (rb->current_write_offset + size) & (rb->size - 1);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(micscif_rb_write);
+
+/*
+ * micscif_rb_get_next
+ * Read from ring buffer.
+ * @rb - The RingBuffer context
+ * @msg - buffer to hold the message. Must be at least size bytes long
+ * @size - Size to be read out passed in, actual bytes read
+ * is returned.
+ * RETURN:
+ * Returns the number of bytes possible to read -- if retVal != size, then
+ * the read does not occur.
+ */
+int micscif_rb_get_next (struct micscif_rb *rb, void *msg, uint32_t size)
+{
+ void *header = NULL;
+ int read_size = 0;
+ /*
+ * warning: RingBufferGet() looks at the shared write pointer
+ */
+ header = micscif_rb_get(rb, size);
+ if (header) {
+ uint32_t next_cmd_offset =
+ (rb->current_read_offset + size) & (rb->size - 1);
+ read_size = size;
+ rb->old_current_read_offset = rb->current_read_offset;
+ rb->current_read_offset = next_cmd_offset;
+ if (memcpy_fromrb(rb, header, msg, size)) // add check here
+ return -EFAULT;
+ }
+ return read_size;
+}
+EXPORT_SYMBOL(micscif_rb_get_next);
+
+/**
+ * micscif_rb_update_read_ptr
+ * @rb - The RingBuffer context
+ */
+void micscif_rb_update_read_ptr(struct micscif_rb *rb)
+{
+ uint32_t old_offset;
+ uint32_t new_offset;
+ smp_mb();
+ old_offset = rb->old_current_read_offset;
+ new_offset = rb->current_read_offset;
+
+ /*
+ * XPU_RACE_CONDITION:
+ * pReadPointer is ready to move
+ * Moving read pointer transfers ownership to MIC
+ * What if MICCPU starts writing to buffer before all
+ * writes were flushed?
+ * Need to flush out all pending writes before pointer update
+ */
+ smp_mb();
+
+#ifdef CONFIG_ML1OM
+ serializing_request((volatile uint8_t*) rb->rb_base+old_offset);
+#endif
+
+ *rb->read_ptr = new_offset;
+#ifdef CONFIG_ML1OM
+ /*
+ * Readback since KNF doesn't guarantee that PCI ordering is maintained.
+ * Need a memory barrier on the host before the readback so the readback
+ * doesn't load from the write combining buffer but will go across to the
+ * PCI bus that will then flush the posted write to the device.
+ */
+ smp_mb();
+ serializing_request(rb->read_ptr);
+#endif
+#if defined(CONFIG_MK1OM) && defined(_MIC_SCIF_)
+ /*
+ * KNC Si HSD 3853952: For the case where a Core is performing an EXT_WR
+ * followed by a Doorbell Write, the Core must perform two EXT_WR to the
+ * same address with the same data before it does the Doorbell Write.
+ * This way, if ordering is violate for the Interrupt Message, it will
+ * fall just behind the first Posted associated with the first EXT_WR.
+ */
+ *rb->read_ptr = new_offset;
+#endif
+ smp_mb();
+}
+EXPORT_SYMBOL(micscif_rb_update_read_ptr);
+
+/**
+ * micscif_rb_count
+ * @rb - The RingBuffer context
+ * RETURN: number of empty slots in the RB
+ */
+uint32_t micscif_rb_count(struct micscif_rb *rb, uint32_t size)
+{
+ if (count_in_ring(rb->current_write_offset,
+ rb->current_read_offset,
+ rb->size) < size) {
+ /*
+ * Update from the HW write pointer if empty
+ */
+ rb->old_current_write_offset = rb->current_write_offset;
+ rb->current_write_offset = *rb->write_ptr;
+ }
+ return count_in_ring(rb->current_write_offset,
+ rb->current_read_offset,
+ rb->size);
+}
+EXPORT_SYMBOL(micscif_rb_count);
+
+/**
+ * micscif_rb_commit
+ * To submit the buffer to let the uOS to fetch it
+ * @rb - The RingBuffer context
+ */
+void micscif_rb_commit(struct micscif_rb *rb)
+{
+ /*
+ * XPU_RACE_CONDITION:
+ * Writing to ringbuffer memory before updating the pointer
+ * can be out-of-order and write combined.
+ * This is the point where we start to care about
+ * consistency of the data.
+ * There are two race conditions below:
+ * (1) Ring buffer pointer moves before all data is flushed:
+ * if uOS is late taking the interrupt for the previous transaction,
+ * it may take the new write pointer immediately
+ * and start accessing data in the ringbuffer.
+ * Ring buffer data must be consistent before we update the write
+ * pointer. We read back the address at oldCurrentWriteOffset
+ * -- this is the location in memory written during the last
+ * ring buffer operation; keep in mind that ring buffers and ring buffer
+ * pointers can be in different kinds of memory (host vs MIC,
+ * depending on currently active workaround flags.
+ * (2) If uOS takes interrupt while write pointer value is still
+ * in-flight may result in uOS reading old value, message being lost,
+ * and the deadlock. Must put another memory barrier after readback --
+ * revents read-passing-read from later read
+ */
+ smp_mb();
+#ifdef CONFIG_ML1OM
+ /*
+ * Also makes sure the following read is not reordered
+ */
+ serializing_request((char*)rb->rb_base + rb->current_write_offset);
+#endif
+ *rb->write_ptr = rb->current_write_offset;
+#ifdef CONFIG_ML1OM
+ /*
+ * Readback since KNF doesn't guarantee that PCI ordering is maintained.
+ * Need a memory barrier on the host before the readback so the readback
+ * doesn't load from the write combining buffer but will go across to the
+ * PCI bus that will then flush the posted write to the device.
+ */
+ smp_mb();
+ serializing_request(rb->write_ptr);
+#endif
+#if defined(CONFIG_MK1OM) && defined(_MIC_SCIF_)
+ /*
+ * KNC Si HSD 3853952: For the case where a Core is performing an EXT_WR
+ * followed by a Doorbell Write, the Core must perform two EXT_WR to the
+ * same address with the same data before it does the Doorbell Write.
+ * This way, if ordering is violate for the Interrupt Message, it will
+ * fall just behind the first Posted associated with the first EXT_WR.
+ */
+ *rb->write_ptr = rb->current_write_offset;
+#endif
+ smp_mb();
+}
+EXPORT_SYMBOL(micscif_rb_commit);
+
+/**
+ * micscif_rb_get
+ * To get next packet from the ring buffer
+ * @rb - The RingBuffer context
+ * RETURN:
+ * NULL if no packet in the ring buffer
+ * Otherwise The pointer of the next packet
+ */
+static void *micscif_rb_get(struct micscif_rb *rb, uint32_t size)
+{
+ void *header = NULL;
+
+ if (micscif_rb_count(rb, size) >= size)
+ header = (char*)rb->rb_base + rb->current_read_offset;
+ return header;
+}
+
+/**
+ * micscif_rb_get_version
+ * Return the ring buffer module version
+ */
+uint16_t micscif_rb_get_version(void)
+{
+ return RING_BUFFER_VERSION;
+}
+EXPORT_SYMBOL(micscif_rb_get_version);
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+#include "mic/micscif_smpt.h"
+#include "mic/micscif_kmem_cache.h"
+#include "mic/micscif_rma_list.h"
+#ifndef _MIC_SCIF_
+#include "mic_common.h"
+#endif
+#include "mic/mic_dma_api.h"
+#include "mic/micscif_map.h"
+
/* Module-wide RMA tunables: registration caching defaults to off,
 * huge page support defaults to on. */
bool mic_reg_cache_enable = 0;

bool mic_huge_page_enable = 1;

#ifdef _MIC_SCIF_
/* Card-side DMA engine handle. */
mic_dma_handle_t mic_dma_handle;
#endif
+static inline
+void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
+ struct endpt *ep, bool inrange,
+ uint64_t start, uint64_t len);
+#ifdef CONFIG_MMU_NOTIFIER
+static void scif_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end);
/* MMU notifier callbacks: tear down temporary cached windows (TCWs)
 * when the kernel invalidates user mappings backing registered memory. */
static const struct mmu_notifier_ops scif_mmu_notifier_ops = {
	.release = scif_mmu_notifier_release,
	.clear_flush_young = NULL,
	.change_pte = NULL,/*TODO*/
	.invalidate_page = scif_mmu_notifier_invalidate_page,
	.invalidate_range_start = scif_mmu_notifier_invalidate_range_start,
	.invalidate_range_end = scif_mmu_notifier_invalidate_range_end};
+
+static void scif_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct endpt *ep;
+ struct rma_mmu_notifier *mmn;
+ mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
+ ep = mmn->ep;
+ micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
+ pr_debug("%s\n", __func__);
+ return;
+}
+
+static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct endpt *ep;
+ struct rma_mmu_notifier *mmn;
+ mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
+ ep = mmn->ep;
+ micscif_rma_destroy_tcw(mmn, ep, true, address, PAGE_SIZE);
+ pr_debug("%s address 0x%lx\n", __func__, address);
+ return;
+}
+
+static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct endpt *ep;
+ struct rma_mmu_notifier *mmn;
+ mmn = container_of(mn, struct rma_mmu_notifier, ep_mmu_notifier);
+ ep = mmn->ep;
+ micscif_rma_destroy_tcw(mmn, ep, true, (uint64_t)start, (uint64_t)(end - start));
+ pr_debug("%s start=%lx, end=%lx\n", __func__, start, end);
+ return;
+}
+
/* No work here: teardown already happened in invalidate_range_start(). */
static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						struct mm_struct *mm,
						unsigned long start, unsigned long end)
{
	pr_debug("%s\n", __func__);
}
+#endif
+
+#ifdef CONFIG_MMU_NOTIFIER
+void ep_unregister_mmu_notifier(struct endpt *ep)
+{
+ struct endpt_rma_info *rma = &ep->rma_info;
+ struct rma_mmu_notifier *mmn = NULL;
+ struct list_head *item, *tmp;
+ mutex_lock(&ep->rma_info.mmn_lock);
+ list_for_each_safe(item, tmp, &rma->mmn_list) {
+ mmn = list_entry(item,
+ struct rma_mmu_notifier, list_member);
+ mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm);
+#ifdef RMA_DEBUG
+ BUG_ON(atomic_long_sub_return(1, &ms_info.mmu_notif_cnt) < 0);
+#endif
+ list_del(item);
+ kfree(mmn);
+ }
+ mutex_unlock(&ep->rma_info.mmn_lock);
+}
+
+static void init_mmu_notifier(struct rma_mmu_notifier *mmn, struct mm_struct *mm, struct endpt *ep)
+{
+ mmn->ep = ep;
+ mmn->mm = mm;
+ mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops;
+ INIT_LIST_HEAD(&mmn->list_member);
+ INIT_LIST_HEAD(&mmn->tc_reg_list);
+}
+
+static struct rma_mmu_notifier *find_mmu_notifier(struct mm_struct *mm, struct endpt_rma_info *rma)
+{
+ struct rma_mmu_notifier *mmn;
+ struct list_head *item;
+ list_for_each(item, &rma->mmn_list) {
+ mmn = list_entry(item,
+ struct rma_mmu_notifier, list_member);
+ if (mmn->mm == mm)
+ return mmn;
+ }
+ return NULL;
+}
+#endif
+
+/**
+ * micscif_rma_ep_init:
+ * @ep: end point
+ *
+ * Initialize RMA per EP data structures.
+ */
+int micscif_rma_ep_init(struct endpt *ep)
+{
+ int ret;
+ struct endpt_rma_info *rma = &ep->rma_info;
+
+ mutex_init (&rma->rma_lock);
+ if ((ret = va_gen_init(&rma->va_gen,
+ VA_GEN_MIN, VA_GEN_RANGE)) < 0)
+ goto init_err;
+ spin_lock_init(&rma->tc_lock);
+ mutex_init (&rma->mmn_lock);
+ mutex_init (&rma->va_lock);
+ INIT_LIST_HEAD(&rma->reg_list);
+ INIT_LIST_HEAD(&rma->remote_reg_list);
+ atomic_set(&rma->tw_refcount, 0);
+ atomic_set(&rma->tw_total_pages, 0);
+ atomic_set(&rma->tcw_refcount, 0);
+ atomic_set(&rma->tcw_total_pages, 0);
+ init_waitqueue_head(&rma->fence_wq);
+ rma->fence_refcount = 0;
+ rma->async_list_del = 0;
+ rma->dma_chan = NULL;
+ INIT_LIST_HEAD(&rma->mmn_list);
+ INIT_LIST_HEAD(&rma->task_list);
+init_err:
+ return ret;
+}
+
+/**
+ * micscif_rma_ep_can_uninit:
+ * @ep: end point
+ *
+ * Returns 1 if an endpoint can be uninitialized and 0 otherwise.
+ */
+int micscif_rma_ep_can_uninit(struct endpt *ep)
+{
+ int ret = 0;
+
+ /* Destroy RMA Info only if both lists are empty */
+ if (list_empty(&ep->rma_info.reg_list) &&
+ list_empty(&ep->rma_info.remote_reg_list) &&
+#ifdef CONFIG_MMU_NOTIFIER
+ list_empty(&ep->rma_info.mmn_list) &&
+#endif
+ !atomic_read(&ep->rma_info.tw_refcount) &&
+ !atomic_read(&ep->rma_info.tcw_refcount))
+ ret = 1;
+ return ret;
+}
+
+#ifdef _MIC_SCIF_
+/**
+ * __micscif_setup_proxy_dma:
+ * @ep: SCIF endpoint descriptor.
+ *
+ * Sets up data structures for P2P Proxy DMAs.
+ */
+static int __micscif_setup_proxy_dma(struct endpt *ep)
+{
+ struct endpt_rma_info *rma = &ep->rma_info;
+ int err = 0;
+ uint64_t *tmp = NULL;
+
+ mutex_lock(&rma->rma_lock);
+ if (is_p2p_scifdev(ep->remote_dev) && !rma->proxy_dma_va) {
+ if (!(tmp = scif_zalloc(PAGE_SIZE))) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if ((err = map_virt_into_aperture(&rma->proxy_dma_phys,
+ tmp,
+ ep->remote_dev, PAGE_SIZE))) {
+ scif_free(tmp, PAGE_SIZE);
+ goto error;
+ }
+ *tmp = OP_IDLE;
+ rma->proxy_dma_va = tmp;
+ }
+error:
+ mutex_unlock(&rma->rma_lock);
+ return err;
+}
+
+static __always_inline int micscif_setup_proxy_dma(struct endpt *ep)
+{
+ if (ep->rma_info.proxy_dma_va)
+ return 0;
+
+ return __micscif_setup_proxy_dma(ep);
+}
+
+/**
+ * micscif_teardown_proxy_dma:
+ * @ep: SCIF endpoint descriptor.
+ *
+ * Tears down data structures setup for P2P Proxy DMAs.
+ */
+void micscif_teardown_proxy_dma(struct endpt *ep)
+{
+ struct endpt_rma_info *rma = &ep->rma_info;
+ mutex_lock(&rma->rma_lock);
+ if (rma->proxy_dma_va) {
+ unmap_from_aperture(rma->proxy_dma_phys, ep->remote_dev, PAGE_SIZE);
+ scif_free(rma->proxy_dma_va, PAGE_SIZE);
+ rma->proxy_dma_va = NULL;
+ }
+ mutex_unlock(&rma->rma_lock);
+}
+
+/**
+ * micscif_proxy_dma:
+ * @ep: SCIF endpoint descriptor.
+ * @copy_work: DMA copy work information.
+ *
+ * This API does the following:
+ * 1) Sends the peer a SCIF Node QP message with the information
+ * required to program a proxy DMA to covert a P2P Read to a Write
+ * which will initiate a DMA transfer from the peer card to self.
+ * The reason for this special code path is KNF and KNC P2P read
+ * performance being much lower than P2P write performance on Crown
+ * Pass platforms.
+ * 2) Poll for an update of the known proxy dma VA to OP_COMPLETED
+ * via a SUD by the peer.
+ */
+static int micscif_proxy_dma(scif_epd_t epd, struct mic_copy_work *work)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ struct nodemsg msg;
+ unsigned long ts = jiffies;
+ struct endpt_rma_info *rma = &ep->rma_info;
+ int err;
+ volatile uint64_t *proxy_dma_va = rma->proxy_dma_va;
+
+ mutex_lock(&ep->rma_info.rma_lock);
+ /*
+ * Bail out if there is a Proxy DMA already in progress
+ * for this endpoint. The callee will fallback on self
+ * DMAs upon an error.
+ */
+ if (*proxy_dma_va != OP_IDLE) {
+ mutex_unlock(&ep->rma_info.rma_lock);
+ err = -EBUSY;
+ goto error;
+ }
+ *proxy_dma_va = OP_IN_PROGRESS;
+ mutex_unlock(&ep->rma_info.rma_lock);
+
+ msg.src = ep->port;
+ msg.uop = work->ordered ? SCIF_PROXY_ORDERED_DMA : SCIF_PROXY_DMA;
+ msg.payload[0] = ep->remote_ep;
+ msg.payload[1] = work->src_offset;
+ msg.payload[2] = work->dst_offset;
+ msg.payload[3] = work->len;
+
+ if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
+ goto error_init_va;
+
+ while (*proxy_dma_va != OP_COMPLETED) {
+ schedule();
+ if (time_after(jiffies,
+ ts + NODE_ALIVE_TIMEOUT)) {
+ err = -EBUSY;
+ goto error_init_va;
+ }
+ }
+ err = 0;
+error_init_va:
+ *proxy_dma_va = OP_IDLE;
+error:
+ return err;
+}
+#endif
+
+/**
+ * micscif_create_pinned_pages:
+ * @nr_pages: number of pages in window
+ * @prot: read/write protection
+ *
+ * Allocate and prepare a set of pinned pages.
+ */
+struct scif_pinned_pages *micscif_create_pinned_pages(int nr_pages, int prot)
+{
+ struct scif_pinned_pages *pinned_pages;
+
+ might_sleep();
+ if (!(pinned_pages = scif_zalloc(sizeof(*pinned_pages))))
+ goto error;
+
+ if (!(pinned_pages->pages = scif_zalloc(nr_pages *
+ sizeof(*(pinned_pages->pages)))))
+ goto error_free_pinned_pages;
+
+ if (!(pinned_pages->num_pages = scif_zalloc(nr_pages *
+ sizeof(*(pinned_pages->num_pages)))))
+ goto error_free_pages;
+
+#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
+ if (!(pinned_pages->vma = scif_zalloc(nr_pages *
+ sizeof(*(pinned_pages->vma)))))
+ goto error_free_num_pages;
+#endif
+
+ pinned_pages->prot = prot;
+ pinned_pages->magic = SCIFEP_MAGIC;
+ pinned_pages->nr_contig_chunks = 0;
+ return pinned_pages;
+
+#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
+error_free_num_pages:
+ scif_free(pinned_pages->num_pages,
+ pinned_pages->nr_pages * sizeof(*(pinned_pages->num_pages)));
+#endif
+error_free_pages:
+ scif_free(pinned_pages->pages,
+ pinned_pages->nr_pages * sizeof(*(pinned_pages->pages)));
+error_free_pinned_pages:
+ scif_free(pinned_pages, sizeof(*pinned_pages));
+error:
+ return NULL;
+}
+
+/**
+ * micscif_destroy_pinned_pages:
+ * @pinned_pages: A set of pinned pages.
+ *
+ * Deallocate resources for pinned pages.
+ */
+int micscif_destroy_pinned_pages(struct scif_pinned_pages *pinned_pages)
+{
+ int j;
+ int writeable = pinned_pages->prot & SCIF_PROT_WRITE;
+ int kernel = SCIF_MAP_KERNEL & pinned_pages->map_flags;
+
+ for (j = 0; j < pinned_pages->nr_pages; j++) {
+ if (pinned_pages->pages[j]) {
+ if (!kernel) {
+ if (writeable)
+ SetPageDirty(pinned_pages->pages[j]);
+#ifdef RMA_DEBUG
+ BUG_ON(!page_count(pinned_pages->pages[j]));
+ BUG_ON(atomic_long_sub_return(1, &ms_info.rma_pin_cnt) < 0);
+#endif
+ page_cache_release(pinned_pages->pages[j]);
+ }
+ }
+ }
+
+#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HUGETLB_PAGE) && !defined(_MIC_SCIF_)
+ scif_free(pinned_pages->vma,
+ pinned_pages->nr_pages * sizeof(*(pinned_pages->vma)));
+#endif
+ scif_free(pinned_pages->pages,
+ pinned_pages->nr_pages * sizeof(*(pinned_pages->pages)));
+ scif_free(pinned_pages->num_pages,
+ pinned_pages->nr_pages * sizeof(*(pinned_pages->num_pages)));
+ scif_free(pinned_pages, sizeof(*pinned_pages));
+ return 0;
+}
+
+/*
+ * micscif_create_window:
+ * @ep: end point
+ * @nr_pages: number of pages in the window
+ * @offset: offset hint
+ * @temp: is this a temporary window?
+ *
+ * Allocate and prepare a self registration window.
+ */
+struct reg_range_t *micscif_create_window(struct endpt *ep,
+		int64_t nr_pages, uint64_t offset, bool temp)
+{
+	struct reg_range_t *window;
+
+	might_sleep();
+	/* Zeroed allocation: every pointer member starts NULL, which the
+	 * error path below relies on to free only what was allocated.
+	 */
+	if (!(window = scif_zalloc(sizeof(struct reg_range_t))))
+		goto error;
+
+#ifdef CONFIG_ML1OM
+	/* Physical-address shadow arrays are allocated only for
+	 * non-temporary windows in the CONFIG_ML1OM build.
+	 */
+	if (!temp) {
+		if (!(window->phys_addr = scif_zalloc(nr_pages *
+				sizeof(*(window->phys_addr)))))
+			goto error_free_window;
+
+		if (!(window->temp_phys_addr = scif_zalloc(nr_pages *
+				sizeof(*(window->temp_phys_addr)))))
+			goto error_free_window;
+	}
+#endif
+
+	if (!(window->dma_addr = scif_zalloc(nr_pages *
+			sizeof(*(window->dma_addr)))))
+		goto error_free_window;
+
+	if (!(window->num_pages = scif_zalloc(nr_pages *
+			sizeof(*(window->num_pages)))))
+		goto error_free_window;
+
+	window->offset = offset;
+	/* The endpoint pointer is stored as a uint64_t handle */
+	window->ep = (uint64_t)ep;
+	window->magic = SCIFEP_MAGIC;
+	window->reg_state = OP_IDLE;
+	init_waitqueue_head(&window->regwq);
+	window->unreg_state = OP_IDLE;
+	init_waitqueue_head(&window->unregwq);
+	INIT_LIST_HEAD(&window->list_member);
+	window->type = RMA_WINDOW_SELF;
+	window->temp = temp;
+#ifdef _MIC_SCIF_
+	micscif_setup_proxy_dma(ep);
+#endif
+	return window;
+
+error_free_window:
+	/* Unallocated members are still NULL (scif_zalloc), so each free
+	 * is guarded by a NULL check.
+	 */
+	if (window->dma_addr)
+		scif_free(window->dma_addr, nr_pages * sizeof(*(window->dma_addr)));
+#ifdef CONFIG_ML1OM
+	if (window->temp_phys_addr)
+		scif_free(window->temp_phys_addr, nr_pages * sizeof(*(window->temp_phys_addr)));
+	if (window->phys_addr)
+		scif_free(window->phys_addr, nr_pages * sizeof(*(window->phys_addr)));
+#endif
+	scif_free(window, sizeof(*window));
+error:
+	return NULL;
+}
+
+/**
+ * micscif_destroy_incomplete_window:
+ * @ep: end point
+ * @window: registration window
+ *
+ * Deallocate resources for self window.
+ */
+int micscif_destroy_incomplete_window(struct endpt *ep, struct reg_range_t *window)
+{
+	int err;
+	int64_t nr_pages = window->nr_pages;
+	struct allocmsg *alloc = &window->alloc_handle;
+	struct nodemsg msg;
+
+	RMA_MAGIC(window);
+retry:
+	/* Wait for the peer's allocation response; keep retrying the wait
+	 * for as long as the remote node is still alive.
+	 */
+	err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
+	if (!err && scifdev_alive(ep))
+		goto retry;
+
+	/* The peer did allocate its side: tell it to free that memory.
+	 * The send result is deliberately ignored — we tear down locally
+	 * regardless.
+	 */
+	if (OP_COMPLETED == alloc->state) {
+		msg.uop = SCIF_FREE_VIRT;
+		msg.src = ep->port;
+		msg.payload[0] = ep->remote_ep;
+		msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
+		msg.payload[2] = (uint64_t)window;
+		msg.payload[3] = SCIF_REGISTER;
+		micscif_nodeqp_send(ep->remote_dev, &msg, ep);
+	}
+
+	/* Return the registered offset range, then free per-chunk arrays
+	 * (each may be NULL if window setup failed early).
+	 */
+	micscif_free_window_offset(ep, window->offset,
+		window->nr_pages << PAGE_SHIFT);
+	if (window->dma_addr)
+		scif_free(window->dma_addr, nr_pages *
+			sizeof(*(window->dma_addr)));
+	if (window->num_pages)
+		scif_free(window->num_pages, nr_pages *
+			sizeof(*(window->num_pages)));
+#ifdef CONFIG_ML1OM
+	if (window->phys_addr)
+		scif_free(window->phys_addr, window->nr_pages *
+			sizeof(*(window->phys_addr)));
+	if (window->temp_phys_addr)
+		scif_free(window->temp_phys_addr, nr_pages *
+			sizeof(*(window->temp_phys_addr)));
+#endif
+	scif_free(window, sizeof(*window));
+	return 0;
+}
+
+/**
+ * micscif_destroy_window:
+ * @ep: end point
+ * @window: registration window
+ *
+ * Deallocate resources for self window.
+ */
+int micscif_destroy_window(struct endpt *ep, struct reg_range_t *window)
+{
+	int j;
+	struct scif_pinned_pages *pinned_pages = window->pinned_pages;
+	int64_t nr_pages = window->nr_pages;
+
+	might_sleep();
+	RMA_MAGIC(window);
+	/* Non-temporary windows tracked an mm: drop the pinned-VM
+	 * accounting and the mm reference.
+	 */
+	if (!window->temp && window->mm) {
+		__scif_dec_pinned_vm_lock(window->mm, window->nr_pages, 0);
+		__scif_release_mm(window->mm);
+		window->mm = NULL;
+	}
+
+	/* The offset may already have been freed by
+	 * micscif_unregister_window() (offset_freed flag).
+	 */
+	if (!window->offset_freed)
+		micscif_free_window_offset(ep, window->offset,
+			window->nr_pages << PAGE_SHIFT);
+	/* Unmap each contiguous chunk from the aperture */
+	for (j = 0; j < window->nr_contig_chunks; j++) {
+		if (window->dma_addr[j]) {
+			unmap_from_aperture(
+				window->dma_addr[j],
+				ep->remote_dev,
+				window->num_pages[j] << PAGE_SHIFT);
+		}
+	}
+
+	/*
+	 * Decrement references for this set of pinned pages from
+	 * this window.
+	 */
+	j = atomic_sub_return((int32_t)pinned_pages->nr_pages,
+		&pinned_pages->ref_count);
+	BUG_ON(j < 0);
+	/*
+	 * If the ref count for pinned_pages is zero then someone
+	 * has already called scif_unpin_pages() for it and we should
+	 * destroy the page cache.
+	 */
+	if (!j)
+		micscif_destroy_pinned_pages(window->pinned_pages);
+	if (window->dma_addr)
+		scif_free(window->dma_addr, nr_pages *
+			sizeof(*(window->dma_addr)));
+	if (window->num_pages)
+		scif_free(window->num_pages, nr_pages *
+			sizeof(*(window->num_pages)));
+#ifdef CONFIG_ML1OM
+	if (window->phys_addr)
+		scif_free(window->phys_addr, window->nr_pages *
+			sizeof(*(window->phys_addr)));
+	if (window->temp_phys_addr)
+		scif_free(window->temp_phys_addr, nr_pages *
+			sizeof(*(window->temp_phys_addr)));
+#endif
+	/* Poison the magic so a stale pointer trips RMA_MAGIC */
+	window->magic = 0;
+	scif_free(window, sizeof(*window));
+	return 0;
+}
+
+/**
+ * micscif_create_remote_lookup:
+ * @ep: end point
+ * @window: remote window
+ *
+ * Allocate and prepare lookup entries for the remote
+ * end to copy over the physical addresses.
+ * Returns 0 on success and appropriate errno on failure.
+ */
+int micscif_create_remote_lookup(struct endpt *ep, struct reg_range_t *window)
+{
+	int i, j, err = 0;
+	int64_t nr_pages = window->nr_pages;
+	bool vmalloc_dma_phys;
+#ifdef CONFIG_ML1OM
+	bool vmalloc_temp_phys = false;
+	bool vmalloc_phys = false;
+#endif
+	might_sleep();
+
+	/* Map window */
+	err = map_virt_into_aperture(&window->mapped_offset,
+		window, ep->remote_dev, sizeof(*window));
+	if (err)
+		goto error_window;
+
+	/* Compute the number of lookup entries. 21 == 2MB Shift */
+	window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE,
+		((2) * 1024 * 1024)) >> 21;
+
+	/* On allocation failure report -ENOMEM explicitly; previously the
+	 * stale err (0) was returned, making callers treat a window with
+	 * NULL lookup arrays as a success.
+	 */
+	if (!(window->dma_addr_lookup.lookup =
+			scif_zalloc(window->nr_lookup *
+			sizeof(*(window->dma_addr_lookup.lookup))))) {
+		err = -ENOMEM;
+		goto error_window;
+	}
+
+	/* Map DMA physical address lookup array */
+	err = map_virt_into_aperture(&window->dma_addr_lookup.offset,
+		window->dma_addr_lookup.lookup, ep->remote_dev,
+		window->nr_lookup *
+		sizeof(*window->dma_addr_lookup.lookup));
+	if (err)
+		goto error_window;
+
+	vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]);
+
+#ifdef CONFIG_ML1OM
+	if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) {
+		if (!(window->temp_phys_addr_lookup.lookup =
+				scif_zalloc(window->nr_lookup *
+				sizeof(*(window->temp_phys_addr_lookup.lookup))))) {
+			err = -ENOMEM;
+			goto error_window;
+		}
+
+		/* Map physical address lookup array */
+		err = map_virt_into_aperture(&window->temp_phys_addr_lookup.offset,
+			window->temp_phys_addr_lookup.lookup, ep->remote_dev,
+			window->nr_lookup *
+			sizeof(*window->temp_phys_addr_lookup.lookup));
+		if (err)
+			goto error_window;
+
+		if (!(window->phys_addr_lookup.lookup =
+				scif_zalloc(window->nr_lookup *
+				sizeof(*(window->phys_addr_lookup.lookup))))) {
+			err = -ENOMEM;
+			goto error_window;
+		}
+
+		/* Map physical address lookup array */
+		err = map_virt_into_aperture(&window->phys_addr_lookup.offset,
+			window->phys_addr_lookup.lookup, ep->remote_dev,
+			window->nr_lookup *
+			sizeof(*window->phys_addr_lookup.lookup));
+		if (err)
+			goto error_window;
+
+		vmalloc_phys = is_vmalloc_addr(&window->phys_addr[0]);
+		vmalloc_temp_phys = is_vmalloc_addr(&window->temp_phys_addr[0]);
+	}
+#endif
+
+	/* Now map each of the pages containing physical addresses */
+	for (i = 0, j = 0; i < nr_pages; i += NR_PHYS_ADDR_IN_PAGE, j++) {
+#ifdef CONFIG_ML1OM
+		if (ep->remote_dev != &scif_dev[SCIF_HOST_NODE] && !is_self_scifdev(ep->remote_dev)) {
+			err = map_page_into_aperture(
+				&window->temp_phys_addr_lookup.lookup[j],
+				vmalloc_temp_phys ?
+				vmalloc_to_page(&window->temp_phys_addr[i]) :
+				virt_to_page(&window->temp_phys_addr[i]),
+				ep->remote_dev);
+			if (err)
+				goto error_window;
+
+			err = map_page_into_aperture(
+				&window->phys_addr_lookup.lookup[j],
+				vmalloc_phys ?
+				vmalloc_to_page(&window->phys_addr[i]) :
+				virt_to_page(&window->phys_addr[i]),
+				ep->remote_dev);
+			if (err)
+				goto error_window;
+		}
+#endif
+		err = map_page_into_aperture(
+			&window->dma_addr_lookup.lookup[j],
+			vmalloc_dma_phys ?
+			vmalloc_to_page(&window->dma_addr[i]) :
+			virt_to_page(&window->dma_addr[i]),
+			ep->remote_dev);
+		if (err)
+			goto error_window;
+	}
+	return 0;
+error_window:
+	/* No local cleanup: the caller destroys the whole remote window
+	 * (micscif_destroy_remote_window -> micscif_destroy_remote_lookup)
+	 * which frees/unmaps whatever was set up above.
+	 */
+	return err;
+}
+
+/**
+ * micscif_destroy_remote_lookup:
+ * @ep: end point
+ * @window: remote window
+ *
+ * Destroy lookup entries used for the remote
+ * end to copy over the physical addresses.
+ */
+void micscif_destroy_remote_lookup(struct endpt *ep, struct reg_range_t *window)
+{
+	int i, j;
+
+	RMA_MAGIC(window);
+	/* nr_lookup == 0 means the lookup tables were never created */
+	if (window->nr_lookup) {
+		/* Unmap every per-page lookup entry that was mapped */
+		for (i = 0, j = 0; i < window->nr_pages;
+				i += NR_PHYS_ADDR_IN_PAGE, j++) {
+			if (window->dma_addr_lookup.lookup &&
+					window->dma_addr_lookup.lookup[j]) {
+				unmap_from_aperture(
+					window->dma_addr_lookup.lookup[j],
+					ep->remote_dev, PAGE_SIZE);
+			}
+		}
+		/* Unmap and free the lookup array itself */
+		if (window->dma_addr_lookup.offset) {
+			unmap_from_aperture(
+				window->dma_addr_lookup.offset,
+				ep->remote_dev, window->nr_lookup *
+				sizeof(*window->dma_addr_lookup.lookup));
+		}
+		if (window->dma_addr_lookup.lookup)
+			scif_free(window->dma_addr_lookup.lookup, window->nr_lookup *
+				sizeof(*(window->dma_addr_lookup.lookup)));
+		/* Finally drop the mapping of the window struct */
+		if (window->mapped_offset) {
+			unmap_from_aperture(window->mapped_offset,
+				ep->remote_dev, sizeof(*window));
+		}
+		/* Mark the lookup as destroyed so a second call is a no-op */
+		window->nr_lookup = 0;
+	}
+}
+
+/**
+ * micscif_create_remote_window:
+ * @ep: end point
+ * @nr_pages: number of pages in window
+ *
+ * Allocate and prepare a remote registration window.
+ */
+struct reg_range_t *micscif_create_remote_window(struct endpt *ep, int nr_pages)
+{
+	struct reg_range_t *window;
+
+	might_sleep();
+	/* Zeroed allocation: the error path below relies on unset pointer
+	 * members being NULL when micscif_destroy_remote_window runs.
+	 */
+	if (!(window = scif_zalloc(sizeof(struct reg_range_t))))
+		goto error_ret;
+
+	/* magic/nr_pages must be valid before the error path, which runs
+	 * RMA_MAGIC and sizes its frees from nr_pages.
+	 */
+	window->magic = SCIFEP_MAGIC;
+	window->nr_pages = nr_pages;
+
+#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+	if (!(window->page_ref_count = scif_zalloc(nr_pages *
+			sizeof(*(window->page_ref_count)))))
+		goto error_window;
+#endif
+
+	if (!(window->dma_addr = scif_zalloc(nr_pages *
+			sizeof(*(window->dma_addr)))))
+		goto error_window;
+
+	if (!(window->num_pages = scif_zalloc(nr_pages *
+			sizeof(*(window->num_pages)))))
+		goto error_window;
+
+#ifdef CONFIG_ML1OM
+	if (!(window->phys_addr = scif_zalloc(nr_pages *
+			sizeof(*(window->phys_addr)))))
+		goto error_window;
+
+	if (!(window->temp_phys_addr = scif_zalloc(nr_pages *
+			sizeof(*(window->temp_phys_addr)))))
+		goto error_window;
+#endif
+
+	/* Build the lookup tables the peer uses to copy addresses over */
+	if (micscif_create_remote_lookup(ep, window))
+		goto error_window;
+
+	window->ep = (uint64_t)ep;
+	window->type = RMA_WINDOW_PEER;
+	set_window_ref_count(window, nr_pages);
+	window->get_put_ref_count = 0;
+	window->unreg_state = OP_IDLE;
+#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+	window->gttmap_state = OP_IDLE;
+	init_waitqueue_head(&window->gttmapwq);
+#endif
+#ifdef _MIC_SCIF_
+	micscif_setup_proxy_dma(ep);
+	window->proxy_dma_phys = ep->rma_info.proxy_dma_phys;
+#endif
+	return window;
+error_window:
+	/* Frees everything allocated so far, including partial lookups */
+	micscif_destroy_remote_window(ep, window);
+error_ret:
+	return NULL;
+}
+
+/**
+ * micscif_destroy_remote_window:
+ * @ep: end point
+ * @window: remote registration window
+ *
+ * Deallocate resources for remote window.
+ */
+void micscif_destroy_remote_window(struct endpt *ep, struct reg_range_t *window)
+{
+	RMA_MAGIC(window);
+	/* Tear down lookup tables first (no-op if never created) */
+	micscif_destroy_remote_lookup(ep, window);
+	/* Each array may be NULL if window creation failed part way */
+	if (window->dma_addr)
+		scif_free(window->dma_addr, window->nr_pages *
+			sizeof(*(window->dma_addr)));
+	if (window->num_pages)
+		scif_free(window->num_pages, window->nr_pages *
+			sizeof(*(window->num_pages)));
+#ifdef CONFIG_ML1OM
+	if (window->phys_addr)
+		scif_free(window->phys_addr, window->nr_pages *
+			sizeof(*(window->phys_addr)));
+	if (window->temp_phys_addr)
+		scif_free(window->temp_phys_addr, window->nr_pages *
+			sizeof(*(window->temp_phys_addr)));
+#endif
+
+#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+	if (window->page_ref_count)
+		scif_free(window->page_ref_count, window->nr_pages *
+			sizeof(*(window->page_ref_count)));
+#endif
+	/* Poison the magic so stale pointers trip RMA_MAGIC */
+	window->magic = 0;
+	scif_free(window, sizeof(*window));
+}
+
+/**
+ * micscif_map_window_pages:
+ * @ep: end point
+ * @window: self registration window
+ * @tmp_wnd: is a temporary window?
+ *
+ * Map pages of a window into the aperture/PCI.
+ * Also compute physical addresses required for DMA.
+ */
+int micscif_map_window_pages(struct endpt *ep, struct reg_range_t *window, bool tmp_wnd)
+{
+	int j, i, err = 0, nr_pages;
+	scif_pinned_pages_t pinned_pages;
+
+	might_sleep();
+	RMA_MAGIC(window);
+
+	/* Walk the window one contiguous chunk at a time: j indexes the
+	 * chunk, i indexes the first pinned page of that chunk.
+	 */
+	pinned_pages = window->pinned_pages;
+	for (j = 0, i = 0; j < window->nr_contig_chunks; j++, i += nr_pages) {
+		nr_pages = pinned_pages->num_pages[i];
+#ifdef _MIC_SCIF_
+#ifdef CONFIG_ML1OM
+		/* phys_addr[] holds addresses as seen from the remote node
+		 * these addressed are then copied into the remote card's
+		 * window structure
+		 * when the remote node is the host and the card is knf
+		 * these addresses are only created at the point of mapping
+		 * the card physical address into gtt (for the KNC the
+		 * the gtt code path returns the local address)
+		 * when the remote node is loopback - the address remains
+		 * the same
+		 * when the remote node is a kn* - the base address of the local
+		 * card as seen from the remote node is added in
+		 */
+		if (!tmp_wnd) {
+			if(ep->remote_dev != &scif_dev[SCIF_HOST_NODE]) {
+				if ((err = map_virt_into_aperture(
+						&window->temp_phys_addr[j],
+						phys_to_virt(page_to_phys(pinned_pages->pages[i])),
+						ep->remote_dev,
+						nr_pages << PAGE_SHIFT))) {
+					int k,l;
+
+					/* Rollback: unmap every chunk mapped
+					 * so far.  NB: reuses (clobbers) the
+					 * outer nr_pages — OK because we
+					 * return immediately afterwards.
+					 */
+					for (l = k = 0; k < i; l++) {
+						nr_pages = pinned_pages->num_pages[k];
+						window->temp_phys_addr[l]
+							&= ~RMA_HUGE_NR_PAGE_MASK;
+						unmap_from_aperture(
+							window->temp_phys_addr[l],
+							ep->remote_dev,
+							nr_pages << PAGE_SHIFT);
+						k += nr_pages;
+						window->temp_phys_addr[l] = 0;
+					}
+					return err;
+				}
+				/* NOTE(review): this inner !tmp_wnd test is
+				 * redundant — we are already inside the
+				 * !tmp_wnd branch above.
+				 */
+				if (!tmp_wnd)
+					RMA_SET_NR_PAGES(window->temp_phys_addr[j], nr_pages);
+			}
+		}
+#endif
+		window->dma_addr[j] =
+			page_to_phys(pinned_pages->pages[i]);
+		if (!tmp_wnd)
+			RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages);
+#else
+		/* Host side: map the chunk into the aperture/PCI */
+		err = map_virt_into_aperture(&window->dma_addr[j],
+			phys_to_virt(page_to_phys(pinned_pages->pages[i])),
+			ep->remote_dev, nr_pages << PAGE_SHIFT);
+		if (err)
+			return err;
+		if (!tmp_wnd)
+			RMA_SET_NR_PAGES(window->dma_addr[j], nr_pages);
+#endif
+		window->num_pages[j] = nr_pages;
+	}
+	return err;
+}
+
+
+/**
+ * micscif_unregister_window:
+ * @window: self registration window
+ *
+ * Send an unregistration request and wait for a response.
+ */
+int micscif_unregister_window(struct reg_range_t *window)
+{
+	int err = 0;
+	struct endpt *ep = (struct endpt *)window->ep;
+	bool send_msg = false;
+
+	might_sleep();
+	/* Caller must hold rma_lock; it is dropped and re-taken below */
+	BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock));
+
+	switch (window->unreg_state) {
+	case OP_IDLE:
+	{
+		/* First caller: claim the send, then fall into the wait */
+		window->unreg_state = OP_IN_PROGRESS;
+		send_msg = true;
+		/* fall through */
+	}
+	case OP_IN_PROGRESS:
+	{
+		/* Hold a window ref across the unlocked wait so the window
+		 * cannot disappear underneath us.
+		 */
+		get_window_ref_count(window, 1);
+		mutex_unlock(&ep->rma_info.rma_lock);
+		if (send_msg && (err = micscif_send_scif_unregister(ep, window))) {
+			window->unreg_state = OP_COMPLETED;
+			goto done;
+		}
+retry:
+		/* Wait for the peer's ack; keep waiting while the remote
+		 * node is alive, give up with -ENODEV once it is not.
+		 */
+		err = wait_event_timeout(window->unregwq,
+			window->unreg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
+		if (!err && scifdev_alive(ep))
+			goto retry;
+		if (!err) {
+			err = -ENODEV;
+			window->unreg_state = OP_COMPLETED;
+			printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+		}
+		/* wait_event_timeout returns remaining jiffies on success */
+		if (err > 0)
+			err = 0;
+done:
+		mutex_lock(&ep->rma_info.rma_lock);
+		put_window_ref_count(window, 1);
+		break;
+	}
+	case OP_FAILED:
+	{
+		if (!scifdev_alive(ep)) {
+			err = -ENODEV;
+			window->unreg_state = OP_COMPLETED;
+		}
+		break;
+	}
+	case OP_COMPLETED:
+		break;
+	default:
+		/* Invalid opcode? */
+		BUG_ON(1);
+	}
+
+	/* Unregistration acked: drop the per-page references */
+	if (OP_COMPLETED == window->unreg_state &&
+			window->ref_count)
+		put_window_ref_count(window, window->nr_pages);
+
+	/* Last reference gone: detach the window and queue it for cleanup */
+	if (!window->ref_count) {
+		atomic_inc(&ep->rma_info.tw_refcount);
+		atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
+		list_del(&window->list_member);
+		micscif_free_window_offset(ep, window->offset,
+			window->nr_pages << PAGE_SHIFT);
+		/* Remember the offset is gone so destroy_window skips it */
+		window->offset_freed = true;
+		mutex_unlock(&ep->rma_info.rma_lock);
+		if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL))
+				&& scifdev_alive(ep)) {
+			drain_dma_intr(ep->rma_info.dma_chan);
+		} else {
+			if (!__scif_dec_pinned_vm_lock(window->mm,
+					window->nr_pages, 1)) {
+				__scif_release_mm(window->mm);
+				window->mm = NULL;
+			}
+		}
+		micscif_queue_for_cleanup(window, &ms_info.mi_rma);
+		mutex_lock(&ep->rma_info.rma_lock);
+	}
+	return err;
+}
+
+/**
+ * micscif_send_alloc_request:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a remote window allocation request
+ */
+int micscif_send_alloc_request(struct endpt *ep, struct reg_range_t *window)
+{
+	struct allocmsg *alloc = &window->alloc_handle;
+	struct nodemsg req;
+
+	/* Arm the allocation handle so the response handler can wake us */
+	alloc->uop = SCIF_REGISTER;
+	alloc->state = OP_IN_PROGRESS;
+	init_waitqueue_head(&alloc->allocwq);
+
+	/* Build and fire the allocation request to the peer */
+	req.uop = SCIF_ALLOC_REQ;
+	req.src = ep->port;
+	req.payload[0] = ep->remote_ep;
+	req.payload[1] = window->nr_pages;
+	req.payload[2] = (uint64_t)alloc;
+	req.payload[3] = SCIF_REGISTER;
+	return micscif_nodeqp_send(ep->remote_dev, &req, ep);
+}
+
+/**
+ * micscif_prep_remote_window:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a remote window allocation request, wait for an allocation response,
+ * prepare the remote window and notify the peer to unmap it once done.
+ */
+int micscif_prep_remote_window(struct endpt *ep, struct reg_range_t *window)
+{
+	struct nodemsg msg;
+	struct reg_range_t *remote_window;
+	struct allocmsg *alloc = &window->alloc_handle;
+	dma_addr_t *dma_phys_lookup, *tmp;
+	int i = 0, j = 0;
+	int nr_contig_chunks, loop_nr_contig_chunks, remaining_nr_contig_chunks, nr_lookup;
+#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+	dma_addr_t *phys_lookup = 0;
+#endif
+	int err, map_err;
+
+	nr_contig_chunks = remaining_nr_contig_chunks = (int)window->nr_contig_chunks;
+
+	/* Map the pages now; a failure is remembered in map_err and only
+	 * acted on after the peer's allocation response arrives.
+	 */
+	if ((map_err = micscif_map_window_pages(ep, window, false))) {
+		printk(KERN_ERR "%s %d map_err %d\n", __func__, __LINE__, map_err);
+	}
+retry:
+	/* Now wait for the response */
+	err = wait_event_timeout(alloc->allocwq, alloc->state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
+	if (!err && scifdev_alive(ep))
+		goto retry;
+
+	if (!err)
+		err = -ENODEV;
+
+	/* wait_event_timeout returns remaining jiffies on success */
+	if (err > 0)
+		err = 0;
+	else
+		return err;
+
+	/* Bail out. The remote end rejected this request */
+	if (OP_FAILED == alloc->state)
+		return -ENOMEM;
+
+	/* Page mapping had failed earlier: tell the peer to free its
+	 * allocation and propagate the mapping error.
+	 */
+	if (map_err) {
+		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, map_err);
+		msg.uop = SCIF_FREE_VIRT;
+		msg.src = ep->port;
+		msg.payload[0] = ep->remote_ep;
+		msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
+		msg.payload[2] = (uint64_t)window;
+		msg.payload[3] = SCIF_REGISTER;
+		if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
+			err = -ENOTCONN;
+		else
+			err = map_err;
+		return err;
+	}
+
+
+	/* Map the peer's window structure so we can fill it in */
+	remote_window = scif_ioremap(alloc->phys_addr,
+		sizeof(*window), ep->remote_dev);
+
+	RMA_MAGIC(remote_window);
+
+	/* Compute the number of lookup entries. 21 == 2MB Shift */
+	nr_lookup = ALIGN(nr_contig_chunks * PAGE_SIZE, ((2) * 1024 * 1024)) >> 21;
+#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+	if (is_p2p_scifdev(ep->remote_dev))
+		phys_lookup = scif_ioremap(remote_window->temp_phys_addr_lookup.offset,
+			nr_lookup *
+			sizeof(*remote_window->temp_phys_addr_lookup.lookup),
+			ep->remote_dev);
+#endif
+
+	dma_phys_lookup = scif_ioremap(remote_window->dma_addr_lookup.offset,
+		nr_lookup *
+		sizeof(*remote_window->dma_addr_lookup.lookup),
+		ep->remote_dev);
+
+	/* Copy addresses over one lookup page (NR_PHYS_ADDR_IN_PAGE
+	 * chunks) at a time: i indexes chunks, j indexes lookup pages.
+	 */
+	while (remaining_nr_contig_chunks) {
+		loop_nr_contig_chunks = min(remaining_nr_contig_chunks, (int)NR_PHYS_ADDR_IN_PAGE);
+		/* #1/2 - Copy physical addresses over to the remote side */
+
+#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+		/* If the remote dev is self or is any node except the host
+		 * its OK to copy the bus address to the remote window
+		 * in the case of the host (for KNF only) the bus address
+		 * is generated at the time of mmap(..) into card memory
+		 * and does not exist at this time
+		 */
+		/* Note:
+		 * the phys_addr[] holds MIC address for remote cards
+		 * -> GTT offset for the host (KNF)
+		 * -> local address for the host (KNC)
+		 * -> local address for loopback
+		 * this is done in map_window_pages(..) except for GTT
+		 * offset for KNF
+		 */
+		if (is_p2p_scifdev(ep->remote_dev)) {
+			tmp = scif_ioremap(phys_lookup[j],
+				loop_nr_contig_chunks * sizeof(*window->temp_phys_addr),
+				ep->remote_dev);
+			memcpy_toio(tmp, &window->temp_phys_addr[i],
+				loop_nr_contig_chunks * sizeof(*window->temp_phys_addr));
+			serializing_request(tmp);
+			smp_mb();
+			scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev);
+		}
+#endif
+		/* #2/2 - Copy DMA addresses (addresses that are fed into the DMA engine)
+		 * We transfer bus addresses which are then converted into a MIC physical
+		 * address on the remote side if it is a MIC, if the remote node is a host
+		 * we transfer the MIC physical address
+		 */
+		tmp = scif_ioremap(
+			dma_phys_lookup[j],
+			loop_nr_contig_chunks * sizeof(*window->dma_addr),
+			ep->remote_dev);
+#ifdef _MIC_SCIF_
+		if (is_p2p_scifdev(ep->remote_dev)) {
+			/* knf:
+			 * send the address as mapped through the GTT (the remote node's
+			 * base address for this node is already added in)
+			 * knc:
+			 * add remote node's base address for this node to convert it
+			 * into a MIC address
+			 */
+			int m;
+			dma_addr_t dma_addr;
+			for (m = 0; m < loop_nr_contig_chunks; m++) {
+#ifdef CONFIG_ML1OM
+				dma_addr = window->temp_phys_addr[i + m];
+#else
+				dma_addr = window->dma_addr[i + m] +
+					ep->remote_dev->sd_base_addr;
+#endif
+				writeq(dma_addr, &tmp[m]);
+			}
+		} else
+			/* Host node or loopback - transfer DMA addresses as is, this is
+			 * the same as a MIC physical address (we use the dma_addr
+			 * and not the phys_addr array since the phys_addr is only setup
+			 * if there is a mmap() request from the host)
+			 */
+			memcpy_toio(tmp, &window->dma_addr[i],
+				loop_nr_contig_chunks * sizeof(*window->dma_addr));
+#else
+		/* Transfer the physical address array - this is the MIC address
+		 * as seen by the card
+		 */
+		memcpy_toio(tmp, &window->dma_addr[i],
+			loop_nr_contig_chunks * sizeof(*window->dma_addr));
+#endif
+		remaining_nr_contig_chunks -= loop_nr_contig_chunks;
+		i += loop_nr_contig_chunks;
+		j++;
+		/* Flush the posted writes before unmapping */
+		serializing_request(tmp);
+		smp_mb();
+		scif_iounmap(tmp, PAGE_SIZE, ep->remote_dev);
+	}
+
+	/* Prepare the remote window for the peer */
+	remote_window->peer_window = (uint64_t)window;
+	remote_window->offset = window->offset;
+	remote_window->prot = window->prot;
+	remote_window->nr_contig_chunks = nr_contig_chunks;
+#ifdef _MIC_SCIF_
+	if (!ep->rma_info.proxy_dma_peer_phys)
+		ep->rma_info.proxy_dma_peer_phys = remote_window->proxy_dma_phys;
+#endif
+#if defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+	if (is_p2p_scifdev(ep->remote_dev))
+		scif_iounmap(phys_lookup,
+			nr_lookup *
+			sizeof(*remote_window->temp_phys_addr_lookup.lookup),
+			ep->remote_dev);
+#endif
+	scif_iounmap(dma_phys_lookup,
+		nr_lookup *
+		sizeof(*remote_window->dma_addr_lookup.lookup),
+		ep->remote_dev);
+	scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev);
+	/* Remember the peer's virtual address of its window */
+	window->peer_window = (uint64_t)alloc->vaddr;
+	return err;
+}
+
+/**
+ * micscif_send_scif_register:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a SCIF_REGISTER message if EP is connected and wait for a
+ * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT
+ * message so that the peer can free its remote window allocated earlier.
+ */
+int micscif_send_scif_register(struct endpt *ep, struct reg_range_t *window)
+{
+	int err = 0;
+	struct nodemsg msg;
+
+	msg.src = ep->port;
+	msg.payload[0] = ep->remote_ep;
+	msg.payload[1] = (uint64_t)window->alloc_handle.vaddr;
+	msg.payload[2] = (uint64_t)window;
+	if (SCIFEP_CONNECTED == ep->state) {
+		msg.uop = SCIF_REGISTER;
+		window->reg_state = OP_IN_PROGRESS;
+		if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep))) {
+			micscif_set_nr_pages(ep->remote_dev, window);
+retry:
+			/* Wait for SCIF_REGISTER_(N)ACK; keep waiting while
+			 * the remote node stays alive.
+			 */
+			err = wait_event_timeout(window->regwq,
+				window->reg_state != OP_IN_PROGRESS, NODE_ALIVE_TIMEOUT);
+			if (!err && scifdev_alive(ep))
+				goto retry;
+			if (!err)
+				err = -ENODEV;
+			/* wait_event_timeout > 0 means the wait succeeded */
+			if (err > 0)
+				err = 0;
+			/* Peer NACKed the registration */
+			if (OP_FAILED == window->reg_state)
+				err = -ENOTCONN;
+		} else {
+			micscif_set_nr_pages(ep->remote_dev, window);
+		}
+	} else {
+		/* Not connected: tell the peer to free its remote window */
+		msg.uop = SCIF_FREE_VIRT;
+		msg.payload[3] = SCIF_REGISTER;
+		if (!(err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
+			err = -ENOTCONN;
+		micscif_set_nr_pages(ep->remote_dev, window);
+	}
+	return err;
+}
+
+/**
+ * micscif_send_scif_unregister:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a SCIF_UNREGISTER message.
+ */
+int micscif_send_scif_unregister(struct endpt *ep, struct reg_range_t *window)
+{
+	struct nodemsg unreg;
+
+	RMA_MAGIC(window);
+	/* Identify the window by the peer's vaddr and our pointer */
+	unreg.uop = SCIF_UNREGISTER;
+	unreg.src = ep->port;
+	unreg.payload[0] = (uint64_t)window->alloc_handle.vaddr;
+	unreg.payload[1] = (uint64_t)window;
+	return micscif_nodeqp_send(ep->remote_dev, &unreg, ep);
+}
+
+/**
+ * micscif_get_window_offset:
+ * @epd: end point descriptor
+ * @flags: flags
+ * @offset: offset hint
+ * @len: length of range
+ * @out_offset: computed offset returned by reference.
+ *
+ * Compute/Claim a new offset for this EP. The callee is supposed to grab
+ * the RMA mutex before calling this API.
+ */
+int micscif_get_window_offset(struct endpt *ep, int flags,
+		uint64_t offset, size_t len, uint64_t *out_offset)
+{
+	uint64_t va;
+	int err = 0;
+
+	might_sleep();
+	mutex_lock(&ep->rma_info.va_lock);
+	if (flags & SCIF_MAP_FIXED) {
+		/* Caller insists on this exact offset: try to claim it */
+		va = va_gen_claim(&ep->rma_info.va_gen,
+			(uint64_t)offset, len);
+		if (va == INVALID_VA_GEN_ADDRESS)
+			err = -EADDRINUSE;
+	} else {
+		/* Let the generator pick any page-aligned range */
+		va = va_gen_alloc(&ep->rma_info.va_gen,
+			len, PAGE_SIZE);
+		if (va == INVALID_VA_GEN_ADDRESS)
+			err = -ENOMEM;
+	}
+	*out_offset = va;
+	mutex_unlock(&ep->rma_info.va_lock);
+	return err;
+}
+
+/**
+ * micscif_free_window_offset:
+ * @offset: offset hint
+ * @len: length of range
+ *
+ * Free offset for this EP. The callee is supposed to grab
+ * the RMA mutex before calling this API.
+ */
+void micscif_free_window_offset(struct endpt *ep,
+		uint64_t offset, size_t len)
+{
+	/* Return [offset, offset + len) to this EP's VA generator */
+	mutex_lock(&ep->rma_info.va_lock);
+	va_gen_free(&ep->rma_info.va_gen, offset, len);
+	mutex_unlock(&ep->rma_info.va_lock);
+}
+
+/**
+ * micscif_register_temp:
+ * @epd: End Point Descriptor.
+ * @addr: virtual address to/from which to copy
+ * @len: length of range to copy
+ * @prot: read/write protection flags
+ * @out_offset: computed offset returned by reference.
+ * @out_window: allocated registered window returned by reference.
+ *
+ * Create a temporary registered window. The peer will not know about this
+ * window. This API is used for scif_vreadfrom()/scif_vwriteto() API's.
+ */
+static int
+micscif_register_temp(scif_epd_t epd, void *addr, size_t len, int prot,
+		off_t *out_offset, struct reg_range_t **out_window)
+{
+	struct endpt *ep = (struct endpt *)epd;
+	int err;
+	scif_pinned_pages_t pinned_pages;
+	size_t aligned_len;
+
+	aligned_len = ALIGN(len, PAGE_SIZE);
+
+	/* Pin the page-aligned range covering [addr, addr + len) */
+	if ((err = __scif_pin_pages((void *)((uint64_t)addr &
+			PAGE_MASK),
+			aligned_len, &prot, 0, &pinned_pages)))
+		return err;
+
+	/* __scif_pin_pages may have adjusted prot; record the final value */
+	pinned_pages->prot = prot;
+
+	/* Compute the offset for this registration */
+	if ((err = micscif_get_window_offset(ep, 0, 0,
+			aligned_len, (uint64_t *)out_offset)))
+		goto error_unpin;
+
+	/* Allocate and prepare self registration window */
+	if (!(*out_window = micscif_create_window(ep, aligned_len >> PAGE_SHIFT,
+			*out_offset, true))) {
+		micscif_free_window_offset(ep, *out_offset, aligned_len);
+		err = -ENOMEM;
+		goto error_unpin;
+	}
+
+	(*out_window)->pinned_pages = pinned_pages;
+	(*out_window)->nr_pages = pinned_pages->nr_pages;
+	(*out_window)->nr_contig_chunks = pinned_pages->nr_contig_chunks;
+	(*out_window)->prot = pinned_pages->prot;
+
+	(*out_window)->va_for_temp = (void*)((uint64_t)addr & PAGE_MASK);
+	if ((err = micscif_map_window_pages(ep, *out_window, true))) {
+		/* Something went wrong! Rollback */
+		micscif_destroy_window(ep, *out_window);
+		*out_window = NULL;
+	} else
+		/* Re-attach the sub-page offset of the original address */
+		*out_offset |= ((uint64_t)addr & ~PAGE_MASK);
+
+	return err;
+error_unpin:
+	if (err)
+		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+	scif_unpin_pages(pinned_pages);
+	return err;
+}
+
+/**
+ * micscif_rma_completion_cb:
+ * @data: RMA cookie
+ *
+ * RMA interrupt completion callback.
+ */
+void micscif_rma_completion_cb(uint64_t data)
+{
+	struct dma_completion_cb *comp_cb = (struct dma_completion_cb *)data;
+#ifndef _MIC_SCIF_
+	struct pci_dev *pdev;
+#endif
+
+	/* Free DMA Completion CB. */
+	if (comp_cb && comp_cb->temp_buf) {
+		/* DMA landed in a bounce buffer: copy the payload (past the
+		 * alignment padding) into the destination window via CPU.
+		 */
+		if (comp_cb->dst_window) {
+			micscif_rma_local_cpu_copy(comp_cb->dst_offset,
+				comp_cb->dst_window, comp_cb->temp_buf + comp_cb->header_padding,
+				comp_cb->len, false);
+		}
+#ifndef _MIC_SCIF_
+		micscif_pci_dev(comp_cb->remote_node, &pdev);
+		mic_ctx_unmap_single(get_per_dev_ctx(comp_cb->remote_node - 1),
+			comp_cb->temp_phys, KMEM_UNALIGNED_BUF_SIZE);
+#endif
+		/* Return the bounce buffer to wherever it came from */
+		if (comp_cb->is_cache)
+			micscif_kmem_cache_free(comp_cb->temp_buf_to_free);
+		else
+			kfree(comp_cb->temp_buf_to_free);
+	}
+	kfree(comp_cb);
+}
+
+static void __micscif_rma_destroy_tcw_ep(struct endpt *ep);
+static
+bool micscif_rma_tc_can_cache(struct endpt *ep, size_t cur_bytes)
+{
+	/* A single transfer bigger than the cache limit is never cached */
+	if ((cur_bytes >> PAGE_SHIFT) > ms_info.mi_rma_tc_limit)
+		return false;
+	/* Adding this transfer would exceed the limit: evict the cached
+	 * temp windows first, then report that caching may proceed.
+	 */
+	if ((atomic_read(&ep->rma_info.tcw_total_pages)
+			+ (cur_bytes >> PAGE_SHIFT)) >
+			ms_info.mi_rma_tc_limit) {
+		printk(KERN_ALERT "%s %d total=%d, current=%zu reached max\n",
+			__func__, __LINE__,
+			atomic_read(&ep->rma_info.tcw_total_pages),
+			(1 + (cur_bytes >> PAGE_SHIFT)));
+		micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc);
+		__micscif_rma_destroy_tcw_ep(ep);
+	}
+	return true;
+}
+
+/**
+ * micscif_rma_copy:
+ * @epd: end point descriptor.
+ * @loffset: offset in local registered address space to/from which to copy
+ * @addr: user virtual address to/from which to copy
+ * @len: length of range to copy
+ * @roffset: offset in remote registered address space to/from which to copy
+ * @flags: flags
+ * @dir: LOCAL->REMOTE or vice versa.
+ *
+ * Validate parameters, check if src/dst registered ranges requested for copy
+ * are valid and initiate either CPU or DMA copy.
+ */
+int micscif_rma_copy(scif_epd_t epd, off_t loffset, void *addr, size_t len,
+ off_t roffset, int flags, enum rma_direction dir, bool last_chunk)
+{
+ struct endpt *ep = (struct endpt *)epd;
+ struct micscif_rma_req remote_req;
+ struct micscif_rma_req req;
+ struct reg_range_t *window = NULL;
+ struct reg_range_t *remote_window = NULL;
+ struct mic_copy_work copy_work;
+ bool loopback;
+ int err = 0;
+ struct dma_channel *chan;
+ struct rma_mmu_notifier *mmn = NULL;
+ bool insert_window = false;
+ bool cache = false;
+
+ if ((err = verify_epd(ep)))
+ return err;
+
+ if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE | SCIF_RMA_SYNC | SCIF_RMA_ORDERED)))
+ return -EINVAL;
+
+ if (!len)
+ return -EINVAL;
+ loopback = is_self_scifdev(ep->remote_dev) ? true : false;
+ copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ? DO_DMA_POLLING : 0;
+ copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk);
+
+#ifdef CONFIG_MMU_NOTIFIER
+ if (!mic_reg_cache_enable)
+ flags &= ~SCIF_RMA_USECACHE;
+#else
+ flags &= ~SCIF_RMA_USECACHE;
+#endif
+#ifndef _MIC_SCIF_
+#ifdef CONFIG_ML1OM
+ /* Use DMA Copies even if CPU copy is requested on KNF MIC from Host */
+ if (flags & SCIF_RMA_USECPU) {
+ flags &= ~SCIF_RMA_USECPU;
+ if (last_chunk)
+ copy_work.fence_type = DO_DMA_POLLING;
+ }
+#endif
+ /* Use CPU for Host<->Host Copies */
+ if (loopback) {
+ flags |= SCIF_RMA_USECPU;
+ copy_work.fence_type = 0x0;
+ }
+#endif
+
+ cache = flags & SCIF_RMA_USECACHE;
+
+ /* Trying to wrap around */
+ if ((loffset && (loffset + (off_t)len < loffset)) ||
+ (roffset + (off_t)len < roffset))
+ return -EINVAL;
+
+ remote_req.out_window = &remote_window;
+ remote_req.offset = roffset;
+ remote_req.nr_bytes = len;
+ /*
+ * If transfer is from local to remote then the remote window
+ * must be writeable and vice versa.
+ */
+ remote_req.prot = LOCAL_TO_REMOTE == dir ? VM_WRITE : VM_READ;
+ remote_req.type = WINDOW_PARTIAL;
+ remote_req.head = &ep->rma_info.remote_reg_list;
+
+#ifdef CONFIG_MMU_NOTIFIER
+ if (addr && cache) {
+ mutex_lock(&ep->rma_info.mmn_lock);
+ mmn = find_mmu_notifier(current->mm, &ep->rma_info);
+ if (!mmn) {
+ mmn = kzalloc(sizeof(*mmn), GFP_KERNEL);
+ if (!mmn) {
+ mutex_unlock(&ep->rma_info.mmn_lock);
+ return -ENOMEM;
+ }
+ init_mmu_notifier(mmn, current->mm, ep);
+ if (mmu_notifier_register(&mmn->ep_mmu_notifier, current->mm)) {
+ mutex_unlock(&ep->rma_info.mmn_lock);
+ kfree(mmn);
+ return -EBUSY;
+ }
+#ifdef RMA_DEBUG
+ atomic_long_add_return(1, &ms_info.mmu_notif_cnt);
+#endif
+ list_add(&mmn->list_member, &ep->rma_info.mmn_list);
+ }
+ mutex_unlock(&ep->rma_info.mmn_lock);
+ }
+#endif
+
+ micscif_inc_node_refcnt(ep->remote_dev, 1);
+#ifdef _MIC_SCIF_
+ if (!(flags & SCIF_RMA_USECPU)) {
+ /*
+ * Proxy the DMA only for P2P reads with transfer size
+ * greater than proxy DMA threshold. scif_vreadfrom(..)
+ * and scif_vwriteto(..) is not supported since the peer
+ * does not have the page lists required to perform the
+ * proxy DMA.
+ */
+ if (ep->remote_dev->sd_proxy_dma_reads &&
+ !addr && dir == REMOTE_TO_LOCAL &&
+ ep->rma_info.proxy_dma_va &&
+ len >= ms_info.mi_proxy_dma_threshold) {
+ copy_work.len = len;
+ copy_work.src_offset = roffset;
+ copy_work.dst_offset = loffset;
+ /* Fall through if there were errors */
+ if (!(err = micscif_proxy_dma(epd, ©_work)))
+ goto error;
+ }
+ }
+#endif
+ mutex_lock(&ep->rma_info.rma_lock);
+ if (addr) {
+ req.out_window = &window;
+ req.nr_bytes = ALIGN(len + ((uint64_t)addr & ~PAGE_MASK), PAGE_SIZE);
+ if (mmn)
+ req.head = &mmn->tc_reg_list;
+ req.va_for_temp = (void*)((uint64_t)addr & PAGE_MASK);
+ req.prot = (LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE | VM_READ);
+ /* Does a valid local window exist? */
+
+ pr_debug("%s %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n",
+ __func__, __LINE__, req.va_for_temp, addr, req.nr_bytes, len);
+ spin_lock(&ep->rma_info.tc_lock);
+ if (!mmn || (err = micscif_query_tcw(ep, &req))) {
+ pr_debug("%s %d err %d req.va_for_temp %p addr %p req.nr_bytes 0x%lx len 0x%lx\n",
+ __func__, __LINE__, err, req.va_for_temp, addr, req.nr_bytes, len);
+ spin_unlock(&ep->rma_info.tc_lock);
+ mutex_unlock(&ep->rma_info.rma_lock);
+ if (cache)
+ if (!micscif_rma_tc_can_cache(ep, req.nr_bytes))
+ cache = false;
+ if ((err = micscif_register_temp(epd, req.va_for_temp, req.nr_bytes,
+ req.prot,
+ &loffset, &window))) {
+ goto error;
+ }
+ mutex_lock(&ep->rma_info.rma_lock);
+ pr_debug("New temp window created addr %p\n", addr);
+ if (cache) {
+ atomic_inc(&ep->rma_info.tcw_refcount);
+ atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tcw_total_pages);
+ if (mmn) {
+ spin_lock(&ep->rma_info.tc_lock);
+ micscif_insert_tcw(window, &mmn->tc_reg_list);
+ spin_unlock(&ep->rma_info.tc_lock);
+ }
+ }
+ insert_window = true;
+ } else {
+ spin_unlock(&ep->rma_info.tc_lock);
+ pr_debug("window found for addr %p\n", addr);
+ BUG_ON(window->va_for_temp > addr);
+ }
+ loffset = window->offset + ((uint64_t)addr - (uint64_t)window->va_for_temp);
+ pr_debug("%s %d addr %p loffset 0x%lx window->nr_pages 0x%llx"
+ " window->va_for_temp %p\n", __func__, __LINE__,
+ addr, loffset, window->nr_pages, window->va_for_temp);
+ RMA_MAGIC(window);
+ }
+
+ /* Does a valid remote window exist? */
+ if ((err = micscif_query_window(&remote_req))) {
+ pr_debug("%s %d err %d roffset 0x%lx len 0x%lx\n",
+ __func__, __LINE__, err, roffset, len);
+ mutex_unlock(&ep->rma_info.rma_lock);
+ goto error;
+ }
+ RMA_MAGIC(remote_window);
+ if (!addr) {
+ req.out_window = &window;
+ req.offset = loffset;
+ /*
+ * If transfer is from local to remote then the self window
+ * must be readable and vice versa.
+ */
+ req.prot = LOCAL_TO_REMOTE == dir ? VM_READ : VM_WRITE;
+ req.nr_bytes = len;
+ req.type = WINDOW_PARTIAL;
+ req.head = &ep->rma_info.reg_list;
+ /* Does a valid local window exist? */
+ if ((err = micscif_query_window(&req))) {
+ printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+ mutex_unlock(&ep->rma_info.rma_lock);
+ goto error;
+ }
+ RMA_MAGIC(window);
+ }
+
+ /*
+ * Preprare copy_work for submitting work to the DMA kernel thread
+ * or CPU copy routine.
+ */
+ copy_work.len = len;
+ copy_work.loopback = loopback;
+ copy_work.remote_dev = ep->remote_dev;
+ copy_work.dma_chan_released = false;
+ if (LOCAL_TO_REMOTE == dir) {
+ copy_work.src_offset = loffset;
+ copy_work.src_window = window;
+ copy_work.dst_offset = roffset;
+ copy_work.dst_window = remote_window;
+ } else {
+ copy_work.src_offset = roffset;
+ copy_work.src_window = remote_window;
+ copy_work.dst_offset = loffset;
+ copy_work.dst_window = window;
+ }
+
+ if (!(flags & SCIF_RMA_USECPU)) {
+ chan = ep->rma_info.dma_chan;
+ if ((err = request_dma_channel(chan))) {
+ mutex_unlock(&ep->rma_info.rma_lock);
+ goto error;
+ }
+ err = micscif_rma_list_dma_copy_wrapper(epd, ©_work,
+ chan, loffset);
+ if (!copy_work.dma_chan_released)
+ free_dma_channel(chan);
+ }
+ if (flags & SCIF_RMA_USECPU) {
+ /* Initiate synchronous CPU copy */
+ micscif_rma_list_cpu_copy(©_work);
+ }
+ if (insert_window && !cache) {
+ atomic_inc(&ep->rma_info.tw_refcount);
+ atomic_add_return((int32_t)window->nr_pages, &ep->rma_info.tw_total_pages);
+ }
+
+ mutex_unlock(&ep->rma_info.rma_lock);
+
+ if (last_chunk) {
+ if (DO_DMA_POLLING == copy_work.fence_type)
+ err = drain_dma_poll(ep->rma_info.dma_chan);
+ else if (DO_DMA_INTR == copy_work.fence_type)
+ err = drain_dma_intr(ep->rma_info.dma_chan);
+ }
+
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ if (insert_window && !cache)
+ micscif_queue_for_cleanup(window, &ms_info.mi_rma);
+ return err;
+error:
+ if (err) {
+ if (addr && window && !cache)
+ micscif_destroy_window(ep, window);
+ printk(KERN_ERR "%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
+ }
+ micscif_dec_node_refcnt(ep->remote_dev, 1);
+ return err;
+}
+
+/**
+ * micscif_send_fence_mark:
+ * @epd: end point descriptor.
+ * @out_mark: Output DMA mark reported by peer.
+ *
+ * Send a remote fence mark request.
+ */
+int micscif_send_fence_mark(scif_epd_t epd, int *out_mark)
+{
+ int err;
+ struct nodemsg msg;
+ struct fence_info *fence_req;
+ struct endpt *ep = (struct endpt *)epd;
+
+ if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ fence_req->state = OP_IN_PROGRESS;
+ init_waitqueue_head(&fence_req->wq);
+
+ msg.src = ep->port;
+ msg.uop = SCIF_MARK;
+ msg.payload[0] = ep->remote_ep;
+ msg.payload[1] = (uint64_t)fence_req;
+
+ if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
+ goto error;
+
+retry:
+ err = wait_event_timeout(fence_req->wq,
+ (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
+ if (!err && scifdev_alive(ep))
+ goto retry;
+ if (!err)
+ err = -ENODEV;
+ if (err > 0)
+ err = 0;
+ if (err < 0) {
+ mutex_lock(&ep->rma_info.rma_lock);
+ if (OP_IN_PROGRESS == fence_req->state)
+ fence_req->state = OP_FAILED;
+ mutex_unlock(&ep->rma_info.rma_lock);
+ }
+ if (OP_COMPLETED == fence_req->state)
+ *out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark;
+
+ if (OP_FAILED == fence_req->state && !err)
+ err = -ENOMEM;
+ mutex_lock(&ep->rma_info.rma_lock);
+ mutex_unlock(&ep->rma_info.rma_lock);
+ kfree(fence_req);
+error:
+ return err;
+}
+
+/**
+ * micscif_send_fence_wait:
+ * @epd: end point descriptor.
+ * @mark: DMA mark to wait for.
+ *
+ * Send a remote fence wait request.
+ */
+int micscif_send_fence_wait(scif_epd_t epd, int mark)
+{
+ int err;
+ struct nodemsg msg;
+ struct fence_info *fence_req;
+ struct endpt *ep = (struct endpt *)epd;
+
+ if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ fence_req->state = OP_IN_PROGRESS;
+ init_waitqueue_head(&fence_req->wq);
+
+ msg.src = ep->port;
+ msg.uop = SCIF_WAIT;
+ msg.payload[0] = ep->remote_ep;
+ msg.payload[1] = (uint64_t)fence_req;
+ msg.payload[2] = mark;
+
+ if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
+ goto error;
+retry:
+ err = wait_event_timeout(fence_req->wq,
+ (OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
+ if (!err && scifdev_alive(ep))
+ goto retry;
+ if (!err)
+ err = -ENODEV;
+ if (err > 0)
+ err = 0;
+ if (err < 0) {
+ mutex_lock(&ep->rma_info.rma_lock);
+ if (OP_IN_PROGRESS == fence_req->state)
+ fence_req->state = OP_FAILED;
+ mutex_unlock(&ep->rma_info.rma_lock);
+ }
+ if (OP_FAILED == fence_req->state && !err)
+ err = -ENOMEM;
+ mutex_lock(&ep->rma_info.rma_lock);
+ mutex_unlock(&ep->rma_info.rma_lock);
+ kfree(fence_req);
+error:
+ return err;
+}
+
/**
 * micscif_send_fence_signal:
 * @epd - endpoint descriptor
 * @roff - remote offset
 * @rval - remote value to write to roffset
 * @loff - local offset
 * @lval - local value to write to loffset
 * @flags - flags (SCIF_SIGNAL_LOCAL and/or SCIF_SIGNAL_REMOTE)
 *
 * Sends a remote fence signal request. Each requested signal (local
 * and/or remote) is sent to the peer and awaited in turn; returns 0 on
 * success or a negative errno.
 */
int micscif_send_fence_signal(scif_epd_t epd, off_t roff, uint64_t rval,
		off_t loff, uint64_t lval, int flags)
{
	int err = 0;
	struct nodemsg msg;
	struct fence_info *fence_req;
	struct endpt *ep = (struct endpt *)epd;

	if (!(fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL))) {
		err = -ENOMEM;
		goto error;
	}

	fence_req->state = OP_IN_PROGRESS;
	init_waitqueue_head(&fence_req->wq);

	msg.src = ep->port;
	if (flags & SCIF_SIGNAL_LOCAL) {
		msg.uop = SCIF_SIG_LOCAL;
		msg.payload[0] = ep->remote_ep;
		msg.payload[1] = roff;
		msg.payload[2] = rval;
		/* The peer echoes this pointer back to the reply handler. */
		msg.payload[3] = (uint64_t)fence_req;
		if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
			goto error_free;
retry1:
		/* Keep waiting as long as the remote device is still alive. */
		err = wait_event_timeout(fence_req->wq,
			(OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
		if (!err && scifdev_alive(ep))
			goto retry1;
		if (!err)
			err = -ENODEV;
		if (err > 0)
			err = 0;
		if (err < 0) {
			/* Mark the request failed so a late reply is ignored. */
			mutex_lock(&ep->rma_info.rma_lock);
			if (OP_IN_PROGRESS == fence_req->state)
				fence_req->state = OP_FAILED;
			mutex_unlock(&ep->rma_info.rma_lock);
		}
		if (OP_FAILED == fence_req->state && !err) {
			err = -ENXIO;
			goto error_free;
		}
	}
	/* Reset state so the same request can track the second signal. */
	fence_req->state = OP_IN_PROGRESS;

	if (flags & SCIF_SIGNAL_REMOTE) {
		msg.uop = SCIF_SIG_REMOTE;
		msg.payload[0] = ep->remote_ep;
		msg.payload[1] = loff;
		msg.payload[2] = lval;
		msg.payload[3] = (uint64_t)fence_req;
		if ((err = micscif_nodeqp_send(ep->remote_dev, &msg, ep)))
			goto error_free;
retry2:
		/* Keep waiting as long as the remote device is still alive. */
		err = wait_event_timeout(fence_req->wq,
			(OP_IN_PROGRESS != fence_req->state), NODE_ALIVE_TIMEOUT);
		if (!err && scifdev_alive(ep))
			goto retry2;
		if (!err)
			err = -ENODEV;
		if (err > 0)
			err = 0;
		if (err < 0) {
			/* Mark the request failed so a late reply is ignored. */
			mutex_lock(&ep->rma_info.rma_lock);
			if (OP_IN_PROGRESS == fence_req->state)
				fence_req->state = OP_FAILED;
			mutex_unlock(&ep->rma_info.rma_lock);
		}
		if (OP_FAILED == fence_req->state && !err) {
			err = -ENXIO;
			goto error_free;
		}
	}
error_free:
	/*
	 * Empty critical section: serializes with the reply handler, which
	 * updates fence_req->state under rma_lock, guaranteeing it is done
	 * touching fence_req before we free it below.
	 */
	mutex_lock(&ep->rma_info.rma_lock);
	mutex_unlock(&ep->rma_info.rma_lock);
	kfree(fence_req);
error:
	return err;
}
+
+/*
+ * micscif_fence_mark:
+ *
+ * @epd - endpoint descriptor
+ * Set up a mark for this endpoint and return the value of the mark.
+ */
+int micscif_fence_mark(scif_epd_t epd)
+{
+ int mark = 0;
+ struct endpt *ep = (struct endpt *)epd;
+ struct dma_channel *chan = ep->rma_info.dma_chan;
+
+ if ((mark = request_dma_channel(chan)))
+ goto error;
+
+ mark = program_dma_mark(chan);
+
+ free_dma_channel(chan);
+error:
+ return mark;
+}
+
/**
 * micscif_rma_destroy_temp_windows:
 *
 * This routine destroys temporary registered windows created
 * by scif_vreadfrom() and scif_vwriteto(). Windows are drained from
 * the global ms_info.mi_rma cleanup list; the list lock is dropped
 * for each window, so iteration restarts from the head after every
 * removal.
 */
void micscif_rma_destroy_temp_windows(void)
{
	struct list_head *item, *tmp;
	struct reg_range_t *window;
	struct endpt *ep;
	struct dma_channel *chan;
	might_sleep();
restart:
	spin_lock(&ms_info.mi_rmalock);
	list_for_each_safe(item, tmp, &ms_info.mi_rma) {
		window = list_entry(item,
			struct reg_range_t, list_member);
		ep = (struct endpt *)window->ep;
		chan = ep->rma_info.dma_chan;

		/* Take the window off the list before dropping the lock;
		 * it is re-queued below only if the DMA engine looks hung. */
		list_del(&window->list_member);
		spin_unlock(&ms_info.mi_rmalock);
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		/*
		 * The window is safe to destroy when any of these hold:
		 *  - no DMA channel / peer no longer alive (no DMA pending),
		 *  - its mark is no longer current but has been processed, or
		 *  - an interrupt-based drain of the channel succeeds.
		 */
		if (!chan ||
			!scifdev_alive(ep) ||
			(!is_current_dma_mark(chan, window->dma_mark) &&
			is_dma_mark_processed(chan, window->dma_mark)) ||
			!drain_dma_intr(chan)) {
			micscif_dec_node_refcnt(ep->remote_dev, 1);
			/* Remove window from global list */
			window->unreg_state = OP_COMPLETED;
		} else {
			micscif_dec_node_refcnt(ep->remote_dev, 1);
			/* DMA engine hung ?? */
			printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d "
				"window->dma_mark 0x%x channel_mark 0x%x\n",
				__func__, __LINE__, get_chan_num(chan),
				ep->sd_state, window->dma_mark, get_dma_mark(chan));
			WARN_ON(1);
			/* Put the window back for a later retry. */
			micscif_queue_for_cleanup(window, &ms_info.mi_rma);
			goto restart;
		}

		if (OP_COMPLETED == window->unreg_state) {
			/* Drop the temp-window page accounting before teardown. */
			BUG_ON(atomic_sub_return((int32_t)window->nr_pages,
				&ep->rma_info.tw_total_pages) < 0);
			if (RMA_WINDOW_SELF == window->type)
				micscif_destroy_window(ep, window);
			else
				micscif_destroy_remote_window(ep, window);
			BUG_ON(atomic_dec_return(
				&ep->rma_info.tw_refcount) < 0);
		}
		/* Lock was dropped: restart from the head of the list. */
		goto restart;
	}
	spin_unlock(&ms_info.mi_rmalock);
}
+
/**
 * __micscif_rma_destroy_tcw:
 *
 * Destroy temporary cached windows (created by scif_vreadfrom() and
 * scif_vwriteto()) on the notifier's tc_reg_list. When @inrange is
 * true, only windows overlapping [@start, @start + @len) are
 * destroyed; otherwise every window on the list is destroyed.
 * Caller must hold ep->rma_info.tc_lock.
 */
static
void __micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
		struct endpt *ep, bool inrange,
		uint64_t start, uint64_t len)
{
	struct list_head *item, *tmp;
	struct reg_range_t *window;
	uint64_t start_va, end_va;
	uint64_t end = start + len;
	list_for_each_safe(item, tmp, &mmn->tc_reg_list) {
		window = list_entry(item,
			struct reg_range_t, list_member);
		ep = (struct endpt *)window->ep;
		if (inrange) {
			/* An empty range matches nothing. */
			if (0 == len)
				break;
			start_va = (uint64_t)window->va_for_temp;
			end_va = start_va + (window->nr_pages << PAGE_SHIFT);
			if (start < start_va) {
				if (end <= start_va) {
					/* NOTE(review): breaking here assumes the list is
					 * kept in ascending VA order, so no later window
					 * can overlap — TODO confirm against
					 * micscif_insert_tcw(). */
					break;
				} else {
					/* overlap: fall through and destroy */
				}

			} else {
				if (start >= end_va) {
					/* window lies entirely below the range */
					continue;
				} else {
					/* overlap: fall through and destroy */
				}
			}
		}
		__micscif_rma_destroy_tcw_helper(window);
	}
}
+
+static inline
+void micscif_rma_destroy_tcw(struct rma_mmu_notifier *mmn,
+ struct endpt *ep, bool inrange,
+ uint64_t start, uint64_t len)
+{
+ unsigned long sflags;
+
+ spin_lock_irqsave(&ep->rma_info.tc_lock, sflags);
+ __micscif_rma_destroy_tcw(mmn, ep, inrange, start, len);
+ spin_unlock_irqrestore(&ep->rma_info.tc_lock, sflags);
+}
+
+static void __micscif_rma_destroy_tcw_ep(struct endpt *ep)
+{
+ struct list_head *item, *tmp;
+ struct rma_mmu_notifier *mmn;
+ spin_lock(&ep->rma_info.tc_lock);
+ list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
+ mmn = list_entry(item,
+ struct rma_mmu_notifier, list_member);
+ __micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
+ }
+ spin_unlock(&ep->rma_info.tc_lock);
+}
+
+void micscif_rma_destroy_tcw_ep(struct endpt *ep)
+{
+ struct list_head *item, *tmp;
+ struct rma_mmu_notifier *mmn;
+ list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
+ mmn = list_entry(item,
+ struct rma_mmu_notifier, list_member);
+ micscif_rma_destroy_tcw(mmn, ep, false, 0, 0);
+ }
+}
+
/**
 * micscif_rma_destroy_tcw_invalid:
 *
 * This routine destroys temporary cached registered windows (created
 * by scif_vreadfrom() and scif_vwriteto()) that have been invalidated,
 * draining them from @list. The list lock is dropped per window, so
 * iteration restarts from the head after each removal.
 */
void micscif_rma_destroy_tcw_invalid(struct list_head *list)
{
	struct list_head *item, *tmp;
	struct reg_range_t *window;
	struct endpt *ep;
	struct dma_channel *chan;
	might_sleep();
restart:
	spin_lock(&ms_info.mi_rmalock);
	list_for_each_safe(item, tmp, list) {
		window = list_entry(item,
			struct reg_range_t, list_member);
		ep = (struct endpt *)window->ep;
		chan = ep->rma_info.dma_chan;
		/* Take the window off the list before dropping the lock. */
		list_del(&window->list_member);
		spin_unlock(&ms_info.mi_rmalock);
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		mutex_lock(&ep->rma_info.rma_lock);
		/*
		 * Safe to destroy when: no DMA channel / peer dead (no DMA
		 * pending), the window's mark is stale but processed, or an
		 * interrupt-based drain of the channel succeeds.
		 */
		if (!chan ||
			!scifdev_alive(ep) ||
			(!is_current_dma_mark(chan, window->dma_mark) &&
			is_dma_mark_processed(chan, window->dma_mark)) ||
			!drain_dma_intr(chan)) {
			micscif_dec_node_refcnt(ep->remote_dev, 1);
			/* Drop the cached-window page accounting, then tear down. */
			BUG_ON(atomic_sub_return((int32_t)window->nr_pages,
				&ep->rma_info.tcw_total_pages) < 0);
			micscif_destroy_window(ep, window);
			BUG_ON(atomic_dec_return(
				&ep->rma_info.tcw_refcount) < 0);
		} else {
			/* DMA engine hung ?? */
			printk(KERN_ERR "%s %d DMA channel %d hung ep->state %d "
				"window->dma_mark 0x%x channel_mark 0x%x\n",
				__func__, __LINE__, get_chan_num(chan),
				ep->sd_state, window->dma_mark, get_dma_mark(chan));
			WARN_ON(1);
			mutex_unlock(&ep->rma_info.rma_lock);
			micscif_dec_node_refcnt(ep->remote_dev, 1);
			/* Re-queue for a later retry and restart the walk. */
			micscif_queue_for_cleanup(window, &ms_info.mi_rma);
			goto restart;
		}
		mutex_unlock(&ep->rma_info.rma_lock);
		/* Lock was dropped: restart from the head of the list. */
		goto restart;
	}
	spin_unlock(&ms_info.mi_rmalock);
}
+
/**
 * micscif_rma_handle_remote_fences:
 *
 * This routine services remote fence requests queued on
 * ms_info.mi_fence: for each request it waits on the requested DMA
 * mark, then ACKs or NACKs the peer.
 */
void micscif_rma_handle_remote_fences(void)
{
	struct list_head *item, *tmp;
	struct remote_fence_info *fence;
	struct endpt *ep;
	int mark;

	might_sleep();
	mutex_lock(&ms_info.mi_fencelock);
	list_for_each_safe(item, tmp, &ms_info.mi_fence) {
		fence = list_entry(item,
			struct remote_fence_info, list_member);
		/* Remove fence from global list */
		list_del(&fence->list_member);

		/* Initiate the fence operation */
		ep = (struct endpt *)fence->msg.payload[0];
		mark = (int)fence->msg.payload[2];
		/* Only remote marks may be queued here. */
		BUG_ON(!(mark & SCIF_REMOTE_FENCE));
		if (dma_mark_wait(ep->rma_info.dma_chan,
				mark & ~SCIF_REMOTE_FENCE, false)) {
			printk(KERN_ERR "%s %d err\n", __func__, __LINE__);
			fence->msg.uop = SCIF_WAIT_NACK;
		} else {
			fence->msg.uop = SCIF_WAIT_ACK;
		}
		micscif_inc_node_refcnt(ep->remote_dev, 1);
		fence->msg.payload[0] = ep->remote_ep;
		/* No error handling for Notification messages. */
		micscif_nodeqp_send(ep->remote_dev, &fence->msg, ep);
		micscif_dec_node_refcnt(ep->remote_dev, 1);
		kfree(fence);
		/*
		 * Decrement ref count and wake up
		 * any thread blocked in the EP close routine waiting
		 * for all such remote fence requests to complete.
		 */
		ep->rma_info.fence_refcount--;
		wake_up(&ep->rma_info.fence_wq);
	}
	mutex_unlock(&ms_info.mi_fencelock);
}
+
+#ifdef CONFIG_MMU_NOTIFIER
/*
 * Work handler for MMU notifier cleanup: destroys invalidated cached
 * windows, then detaches notifiers from endpoints queued on
 * ms_info.mi_mmu_notif_cleanup. The list lock is dropped per endpoint,
 * so iteration restarts from the head after each one.
 */
void micscif_mmu_notif_handler(struct work_struct *work)
{
	struct list_head *pos, *tmpq;
	struct endpt *ep;
restart:
	micscif_rma_destroy_tcw_invalid(&ms_info.mi_rma_tc);
	spin_lock(&ms_info.mi_rmalock);
	list_for_each_safe(pos, tmpq, &ms_info.mi_mmu_notif_cleanup) {
		ep = list_entry(pos, struct endpt, mmu_list);
		/* Take the endpoint off the list before dropping the lock. */
		list_del(&ep->mmu_list);
		spin_unlock(&ms_info.mi_rmalock);
		/* An endpoint queued here must still have notifiers attached. */
		BUG_ON(list_empty(&ep->rma_info.mmn_list));

		micscif_rma_destroy_tcw_ep(ep);
		ep_unregister_mmu_notifier(ep);
		/* Kick the misc work queue to finish any deferred cleanup. */
		queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
		goto restart;
	}
	spin_unlock(&ms_info.mi_rmalock);
}
+#endif
+
/**
 * micscif_reserve_dma_chan:
 * @ep: Endpoint Descriptor.
 *
 * This routine reserves a DMA channel for a particular
 * endpoint. All DMA transfers for an endpoint are always
 * programmed on the same DMA channel. Returns 0 on success or
 * -EBUSY if no channel could be allocated within NODE_ALIVE_TIMEOUT.
 */
int micscif_reserve_dma_chan(struct endpt *ep)
{
	int err = 0;
#ifndef _MIC_SCIF_
	/*
	 * Host Loopback cannot use DMA by design and hence
	 * reserving DMA channels is a nop.
	 */
	if (is_self_scifdev(ep->remote_dev))
		return 0;
#endif
	mutex_lock(&ep->rma_info.rma_lock);
	/* Only allocate once; later calls reuse the cached channel. */
	if (!ep->rma_info.dma_chan) {
		struct dma_channel **chan = &ep->rma_info.dma_chan;
		unsigned long ts = jiffies;
#ifndef _MIC_SCIF_
		/* On the host, look up the per-card DMA context (node ids are 1-based). */
		mic_ctx_t *mic_ctx =
			get_per_dev_ctx(ep->remote_dev->sd_node - 1);
		BUG_ON(!ep->remote_dev->sd_node);
#endif
		/* Retry allocation until it succeeds or the timeout elapses. */
		while (true) {
			if (!(err = allocate_dma_channel((struct mic_dma_ctx_t *)
#ifdef _MIC_SCIF_
				mic_dma_handle,
#else
				mic_ctx->dma_handle,
#endif
				chan)))
				break;
			schedule();
			if (time_after(jiffies,
				ts + NODE_ALIVE_TIMEOUT)) {
				err = -EBUSY;
				goto error;
			}
		}
		/*
		 * NOTE(review): the channel is released back to the DMA thread
		 * immediately after allocation — presumably the pointer stored
		 * in ep->rma_info.dma_chan is all that is needed and users
		 * re-request it per transfer; confirm against request_dma_channel().
		 */
		mic_dma_thread_free_chan(*chan);
	}
error:
	mutex_unlock(&ep->rma_info.rma_lock);
	return err;
}
+
/*
 * micscif_prog_signal:
 * @epd - Endpoint Descriptor
 * @offset - registered address
 * @val - Value to be programmed in SUD.
 * @type - Type of the window.
 *
 * Program a status update descriptor after ensuring that the offset
 * provided is indeed valid. Returns 0 on success or a negative errno.
 */
int micscif_prog_signal(scif_epd_t epd, off_t offset, uint64_t val,
		enum rma_window_type type)
{
	struct endpt *ep = (struct endpt *)epd;
	struct dma_channel *chan = ep->rma_info.dma_chan;
	struct reg_range_t *window = NULL;
	struct micscif_rma_req req;
	int err;
	dma_addr_t phys;

	mutex_lock(&ep->rma_info.rma_lock);
	/* The signal location must be a writable, 8-byte registered range. */
	req.out_window = &window;
	req.offset = offset;
	req.nr_bytes = sizeof(uint64_t);
	req.prot = SCIF_PROT_WRITE;
	req.type = WINDOW_SINGLE;
	if (RMA_WINDOW_SELF == type)
		req.head = &ep->rma_info.reg_list;
	else
		req.head = &ep->rma_info.remote_reg_list;
	/* Does a valid window exist? */
	if ((err = micscif_query_window(&req))) {
		printk(KERN_ERR "%s %d err %d\n",
			__func__, __LINE__, err);
		goto unlock_ret;
	}
	RMA_MAGIC(window);

#ifndef _MIC_SCIF_
	/* Host loopback: write the value directly via the CPU, no DMA. */
	if (unlikely(is_self_scifdev(ep->remote_dev))) {
		void *dst_virt;
		if (RMA_WINDOW_SELF == type)
			/* NOTE(review): len passed is sizeof(uint32_t) although an
			 * 8-byte store follows below — looks inconsistent with the
			 * sizeof(uint64_t) used in the query above; confirm. */
			dst_virt = get_local_va(offset, window,
				sizeof(uint32_t));
		else {
			/* Peer window: locate the backing page and compute its
			 * kernel virtual address by hand. */
			struct page **pages = ((struct reg_range_t *)
				(window->peer_window))->pinned_pages->pages;
			int page_nr = (int) ( (offset - window->offset) >> PAGE_SHIFT );
			off_t page_off = offset & ~PAGE_MASK;
			dst_virt = (void *)((uint64_t)phys_to_virt(page_to_phys(
				pages[page_nr])) | page_off);
		}
		*(uint64_t*)dst_virt = val;
		goto unlock_ret;
	}
#endif
	/* Program a status update descriptor on the endpoint's DMA channel. */
	phys = micscif_get_dma_addr(window, offset, NULL, NULL, NULL);
	if ((err = request_dma_channel(chan)))
		goto unlock_ret;
	err = do_status_update(chan, phys, val);
	free_dma_channel(chan);
unlock_ret:
	mutex_unlock(&ep->rma_info.rma_lock);
	return err;
}
+
+/*
+ * __micscif_kill_apps_with_mmaps:
+ * @ep - The SCIF endpoint
+ *
+ * Kill the applications which have valid remote memory mappings
+ * created via scif_mmap(..).
+ */
+static void __micscif_kill_apps_with_mmaps(struct endpt *ep)
+{
+ struct list_head *item;
+ struct rma_task_info *info;
+
+ spin_lock(&ep->lock);
+ list_for_each(item, &ep->rma_info.task_list) {
+ info = list_entry(item, struct rma_task_info, list_member);
+ kill_pid(info->pid, SIGKILL, 1);
+ pr_debug("%s ep %p pid %p ref %d\n",
+ __func__, ep, info->pid, info->ref_count);
+ }
+ spin_unlock(&ep->lock);
+}
+
+/*
+ * _micscif_kill_apps_with_mmaps:
+ * @node - remote node id.
+ * @head - head of the list of endpoints to kill.
+ *
+ * Traverse the list of endpoints for a particular remote node and
+ * kill applications with valid remote memory mappings.
+ */
+static void _micscif_kill_apps_with_mmaps(int node, struct list_head *head)
+{
+ struct endpt *ep;
+ unsigned long sflags;
+ struct list_head *item;
+
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_for_each(item, head) {
+ ep = list_entry(item, struct endpt, list);
+ if (ep->remote_dev->sd_node == node)
+ __micscif_kill_apps_with_mmaps(ep);
+ }
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+}
+
+/*
+ * micscif_kill_apps_with_mmaps:
+ * @node - remote node id.
+ *
+ * Wrapper for killing applications with valid remote memory mappings
+ * for a particular node. This API is called by peer nodes as part of
+ * handling a lost node.
+ */
+void micscif_kill_apps_with_mmaps(int node)
+{
+ _micscif_kill_apps_with_mmaps(node, &ms_info.mi_connected);
+ _micscif_kill_apps_with_mmaps(node, &ms_info.mi_disconnected);
+}
+
+/*
+ * micscif_query_apps_with_mmaps:
+ * @node - remote node id.
+ * @head - head of the list of endpoints to query.
+ *
+ * Query if any applications for a remote node have valid remote memory
+ * mappings.
+ */
+static bool micscif_query_apps_with_mmaps(int node, struct list_head *head)
+{
+ struct endpt *ep;
+ unsigned long sflags;
+ struct list_head *item;
+ bool ret = false;
+
+ spin_lock_irqsave(&ms_info.mi_connlock, sflags);
+ list_for_each(item, head) {
+ ep = list_entry(item, struct endpt, list);
+ if (ep->remote_dev->sd_node == node &&
+ !list_empty(&ep->rma_info.task_list)) {
+ ret = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ms_info.mi_connlock, sflags);
+ return ret;
+}
+
+/*
+ * micscif_rma_do_apps_have_mmaps:
+ * @node - remote node id.
+ *
+ * Wrapper for querying if any applications have remote memory mappings
+ * for a particular node.
+ */
+bool micscif_rma_do_apps_have_mmaps(int node)
+{
+ return (micscif_query_apps_with_mmaps(node, &ms_info.mi_connected) ||
+ micscif_query_apps_with_mmaps(node, &ms_info.mi_disconnected));
+}
+
/*
 * __micscif_cleanup_rma_for_zombies:
 * @ep - The SCIF endpoint
 *
 * This API is only called while handling a lost node:
 * a) Remote node is dead.
 * b) All endpoints with remote memory mappings have been killed.
 * So we can traverse the remote_reg_list without any locks. Since
 * the window has not yet been unregistered we can drop the ref count
 * and queue it to the cleanup thread.
 */
static void __micscif_cleanup_rma_for_zombies(struct endpt *ep)
{
	struct list_head *pos, *tmp;
	struct reg_range_t *window;

	list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
		window = list_entry(pos, struct reg_range_t, list_member);
		/* If unregistration is complete then why is it on the list? */
		WARN_ON(window->unreg_state == OP_COMPLETED);
		/* Drop the mapping references held by the (now killed) apps. */
		if (window->ref_count)
			put_window_ref_count(window, window->nr_pages);
		/* Only an unreferenced window can be handed to cleanup. */
		if (!window->ref_count) {
			/* Account it as a temp window before queueing teardown. */
			atomic_inc(&ep->rma_info.tw_refcount);
			atomic_add_return((int32_t)window->nr_pages,
				&ep->rma_info.tw_total_pages);
			list_del(&window->list_member);
			micscif_queue_for_cleanup(window, &ms_info.mi_rma);
		}
	}
}
+
+/*
+ * micscif_cleanup_rma_for_zombies:
+ * @node - remote node id.
+ *
+ * Cleanup remote registration lists for zombie endpoints.
+ */
+void micscif_cleanup_rma_for_zombies(int node)
+{
+ struct endpt *ep;
+ unsigned long sflags;
+ struct list_head *item;
+
+ spin_lock_irqsave(&ms_info.mi_eplock, sflags);
+ list_for_each(item, &ms_info.mi_zombie) {
+ ep = list_entry(item, struct endpt, list);
+ if (ep->remote_dev && ep->remote_dev->sd_node == node) {
+ /*
+ * If the zombie endpoint remote node matches the lost
+ * node then the scifdev should not be alive.
+ */
+ WARN_ON(scifdev_alive(ep));
+ __micscif_cleanup_rma_for_zombies(ep);
+ }
+ }
+ spin_unlock_irqrestore(&ms_info.mi_eplock, sflags);
+}
+
+/*
+ * micscif_rma_get_task:
+ *
+ * Store the parent task struct and bump up the number of remote mappings.
+ * If this is the first remote memory mapping for this endpoint then
+ * create a new rma_task_info entry in the epd task list.
+ */
+int micscif_rma_get_task(struct endpt *ep, int nr_pages)
+{
+ struct list_head *item;
+ struct rma_task_info *info;
+ int err = 0;
+
+ spin_lock(&ep->lock);
+ list_for_each(item, &ep->rma_info.task_list) {
+ info = list_entry(item, struct rma_task_info, list_member);
+ if (info->pid == task_tgid(current)) {
+ info->ref_count += nr_pages;
+ pr_debug("%s ep %p existing pid %p ref %d\n",
+ __func__, ep, info->pid, info->ref_count);
+ goto unlock;
+ }
+ }
+ spin_unlock(&ep->lock);
+
+ /* A new task is mapping this window. Create a new entry */
+ if (!(info = kzalloc(sizeof(*info), GFP_KERNEL))) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ info->pid = get_pid(task_tgid(current));
+ info->ref_count = nr_pages;
+ pr_debug("%s ep %p new pid %p ref %d\n",
+ __func__, ep, info->pid, info->ref_count);
+ spin_lock(&ep->lock);
+ list_add_tail(&info->list_member, &ep->rma_info.task_list);
+unlock:
+ spin_unlock(&ep->lock);
+done:
+ return err;
+}
+
+/*
+ * micscif_rma_put_task:
+ *
+ * Bump down the number of remote mappings. if the ref count for this
+ * particular task drops to zero then remove the rma_task_info from
+ * the epd task list.
+ */
+void micscif_rma_put_task(struct endpt *ep, int nr_pages)
+{
+ struct list_head *item;
+ struct rma_task_info *info;
+
+ spin_lock(&ep->lock);
+ list_for_each(item, &ep->rma_info.task_list) {
+ info = list_entry(item, struct rma_task_info, list_member);
+ if (info->pid == task_tgid(current)) {
+ info->ref_count -= nr_pages;
+ pr_debug("%s ep %p pid %p ref %d\n",
+ __func__, ep, info->pid, info->ref_count);
+ if (!info->ref_count) {
+ list_del(&info->list_member);
+ put_pid(info->pid);
+ kfree(info);
+ }
+ goto done;
+ }
+ }
+ /* Why was the task not found? This is a bug. */
+ WARN_ON(1);
+done:
+ spin_unlock(&ep->lock);
+ return;
+}
+
+/* Only debug API's below */
+void micscif_display_window(struct reg_range_t *window, const char *s, int line)
+{
+ int j;
+
+ printk("%s %d window %p type %d temp %d offset 0x%llx"
+ " nr_pages 0x%llx nr_contig_chunks 0x%llx"
+ " prot %d ref_count %d magic 0x%llx peer_window 0x%llx"
+ " unreg_state 0x%x va_for_temp %p\n",
+ s, line, window, window->type, window->temp,
+ window->offset, window->nr_pages, window->nr_contig_chunks,
+ window->prot, window->ref_count, window->magic,
+ window->peer_window, window->unreg_state, window->va_for_temp);
+
+ for (j = 0; j < window->nr_contig_chunks; j++)
+ pr_debug("page[%d] = dma_addr 0x%llx num_pages 0x%x\n",
+ j,
+ window->dma_addr[j],
+ window->num_pages[j]);
+
+ if (RMA_WINDOW_SELF == window->type && window->pinned_pages)
+ for (j = 0; j < window->nr_pages; j++)
+ pr_debug("page[%d] = pinned_pages %p address %p\n",
+ j, window->pinned_pages->pages[j],
+ page_address(window->pinned_pages->pages[j]));
+
+#ifdef CONFIG_ML1OM
+ if (window->temp_phys_addr)
+ for (j = 0; j < window->nr_contig_chunks; j++)
+ pr_debug("page[%d] = temp_phys_addr 0x%llx\n",
+ j, window->temp_phys_addr[j]);
+ if (window->phys_addr)
+ for (j = 0; j < window->nr_pages; j++)
+ pr_debug("page[%d] = phys_addr 0x%llx\n",
+ j, window->phys_addr[j]);
+#endif
+ RMA_MAGIC(window);
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+#include "mic/micscif_smpt.h"
+#include "mic/mic_dma_api.h"
+#include "mic/micscif_kmem_cache.h"
+#include "mic/micscif_rma.h"
+#include "mic/micscif_rma_list.h"
+#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT)
+#include <linux/sched.h>
+#endif
+#include <linux/highmem.h>
+#ifndef _MIC_SCIF_
+#include "mic_common.h"
+#endif
+
+/*
+ * get_local_va - kernel virtual address for offset @off into @window.
+ * Self windows resolve through the pinned page array (page offset is
+ * OR-ed back in); other windows translate via micscif_get_dma_addr()
+ * and the direct map. Returns NULL only on a CONFIG_ML1OM address
+ * translation error. @len is unused.
+ */
+static __always_inline
+void *get_local_va(off_t off, struct reg_range_t *window, size_t len)
+{
+ uint64_t pg_idx = (off - window->offset) >> PAGE_SHIFT;
+ off_t pg_off = off & ~PAGE_MASK;
+ dma_addr_t phys;
+
+ if (window->type == RMA_WINDOW_SELF) {
+ struct page *pg = window->pinned_pages->pages[pg_idx];
+ return (void *)((uint64_t)page_address(pg) | pg_off);
+ }
+
+ phys = micscif_get_dma_addr(window, off, NULL, NULL, NULL);
+#ifdef CONFIG_ML1OM
+ if (phys == RMA_ERROR_CODE)
+ return NULL;
+#endif
+ return (void *)(uint64_t)phys_to_virt(phys);
+}
+
+#ifdef _MIC_SCIF_
+/*
+ * ioremap_remote - card (_MIC_SCIF_) side: map @len bytes of a remote
+ * window at @off for CPU access. Non-loopback physical addresses are
+ * mapped uncached with ioremap_nocache(); loopback addresses are reached
+ * through the direct map (phys_to_virt). @index/@start_off are forwarded
+ * to micscif_get_dma_addr() as a lookup hint. Returns NULL on a
+ * CONFIG_ML1OM translation error. Pair with iounmap_remote().
+ */
+static __always_inline
+void *ioremap_remote(off_t off, struct reg_range_t *window,
+ size_t len, bool loopback, struct micscif_dev *dev, int *index, uint64_t *start_off)
+{
+ void *ret;
+ dma_addr_t phys = micscif_get_dma_addr(window, off, NULL, index, start_off);
+
+#ifdef CONFIG_ML1OM
+ if (RMA_ERROR_CODE == phys)
+ return NULL;
+#endif
+ if (!loopback)
+ ret = ioremap_nocache(phys, len);
+ else
+ ret = (void *)((uint64_t)phys_to_virt(phys));
+ return ret;
+}
+
+/*
+ * ioremap_remote_gtt - card side: thin wrapper over ioremap_remote()
+ * without a lookup hint; @ch_num and @work are unused on this build.
+ */
+static __always_inline
+void *ioremap_remote_gtt(off_t off, struct reg_range_t *window,
+ size_t len, bool loopback, struct micscif_dev *dev, int ch_num, struct mic_copy_work *work)
+{
+ return ioremap_remote(off, window, len, loopback, dev, NULL, NULL);
+}
+#else
+/*
+ * ioremap_remote_gtt - host side: non-loopback offsets are translated to
+ * a card physical address and added to the device aperture's existing
+ * kernel mapping (no ioremap needed); loopback uses the peer window's
+ * pinned pages via the direct map. @ch_num and @work are unused here.
+ */
+static __always_inline
+void *ioremap_remote_gtt(off_t off, struct reg_range_t *window,
+ size_t len, bool loopback, struct micscif_dev *dev, int ch_num, struct mic_copy_work *work)
+{
+ void *ret;
+ uint64_t page_nr = (off - window->offset) >> PAGE_SHIFT;
+ off_t page_off = off & ~PAGE_MASK;
+ if (!loopback) {
+ dma_addr_t phys = micscif_get_dma_addr(window, off, NULL, NULL, NULL);
+ /* Ideally there should be a helper to do the +/-1 */
+ ret = get_per_dev_ctx(dev->sd_node - 1)->aper.va + phys;
+ } else {
+ struct page **pages = ((struct reg_range_t *)
+ (window->peer_window))->pinned_pages->pages;
+ ret = (void *)((uint64_t)phys_to_virt(page_to_phys(pages[page_nr]))
+ | page_off);
+ }
+ return ret;
+}
+
+/*
+ * ioremap_remote - host side: same translation as ioremap_remote_gtt()
+ * above, but forwards the @index/@start_off lookup hint to
+ * micscif_get_dma_addr() on the non-loopback path.
+ */
+static __always_inline
+void *ioremap_remote(off_t off, struct reg_range_t *window,
+ size_t len, bool loopback, struct micscif_dev *dev, int *index, uint64_t *start_off)
+{
+ void *ret;
+ int page_nr = (int)((off - window->offset) >> PAGE_SHIFT);
+ off_t page_off = off & ~PAGE_MASK;
+
+ if (!loopback) {
+ dma_addr_t phys;
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(dev->sd_node - 1);
+ phys = micscif_get_dma_addr(window, off, NULL, index, start_off);
+ ret = mic_ctx->aper.va + phys;
+ } else {
+ struct page **pages = ((struct reg_range_t *)
+ (window->peer_window))->pinned_pages->pages;
+ ret = (void *)((uint64_t)phys_to_virt(page_to_phys(pages[page_nr]))
+ | page_off);
+ }
+ return ret;
+}
+#endif
+
+/*
+ * iounmap_remote - undo ioremap_remote()/ioremap_remote_gtt().
+ * Only card-side (_MIC_SCIF_) non-loopback mappings were created with
+ * ioremap_nocache(); every other case handed out a direct-map or
+ * aperture address and needs no teardown. @size is unused.
+ */
+static __always_inline void
+iounmap_remote(void *virt, size_t size, struct mic_copy_work *work)
+{
+#ifdef _MIC_SCIF_
+ if (work->loopback)
+ return;
+ iounmap(virt);
+#endif
+}
+
+/*
+ * ordered_memcpy - copy @count bytes so the final byte is observably
+ * written last. Takes care of ordering issues caused by:
+ * 1. Hardware: CPU copies from host to card go through WC memory, which
+ *    can complete writes out of order.
+ * 2. Software: memcpy implementations may reorder copy instructions for
+ *    optimization, on both host and card.
+ * The first count - 1 bytes are copied, a write barrier drains them, and
+ * only then is the last byte stored.
+ */
+static inline void ordered_memcpy(volatile char *dst,
+ const char *src, size_t count)
+{
+ if (!count)
+ return;
+
+ memcpy_toio(dst, src, --count);
+ wmb();
+ *(dst + count) = *(src + count);
+}
+
+/*
+ * micscif_unaligned_memcpy - CPU copy of @count bytes to (possibly
+ * write-combining) I/O memory; when @ordered is set, use the
+ * last-byte-last protocol of ordered_memcpy() instead of a plain
+ * memcpy_toio().
+ */
+static inline void micscif_unaligned_memcpy(volatile char *dst,
+ const char *src, size_t count, bool ordered)
+{
+ if (likely(!ordered)) {
+ memcpy_toio(dst, src, count);
+ return;
+ }
+ ordered_memcpy(dst, src, count);
+}
+
+/*
+ * micscif_rma_local_cpu_copy - CPU copy between a self-registered window
+ * list and a temporary buffer.
+ * @offset: RMA offset where the copy starts
+ * @window: first window of the (self) registration list
+ * @temp: temporary buffer
+ * @remaining_len: number of bytes to copy
+ * @to_temp: true copies window -> temp, false copies temp -> window
+ *
+ * Walks the window list at most a page per iteration, advancing to the
+ * next window via list_member once the current window is exhausted.
+ * NOTE(review): if get_local_va() fails part-way the function returns
+ * silently, leaving a partial copy the caller cannot detect — confirm
+ * callers tolerate this.
+ */
+void micscif_rma_local_cpu_copy(uint64_t offset, struct reg_range_t *window, uint8_t *temp, size_t remaining_len, bool to_temp)
+{
+ void *window_virt;
+ size_t loop_len;
+ int offset_in_page;
+ uint64_t end_offset;
+ struct list_head *item;
+
+ BUG_ON(RMA_WINDOW_SELF != window->type);
+
+ /* First chunk: up to the end of the current page. */
+ offset_in_page = offset & ~PAGE_MASK;
+ loop_len = PAGE_SIZE - offset_in_page;
+
+ if (remaining_len < loop_len)
+ loop_len = remaining_len;
+
+ if (!(window_virt = get_local_va(offset, window, loop_len)))
+ return;
+ if (to_temp)
+ memcpy(temp, window_virt, loop_len);
+ else
+ memcpy(window_virt, temp, loop_len);
+
+ offset += loop_len;
+ temp += loop_len;
+ remaining_len -= loop_len;
+
+ end_offset = window->offset +
+ (window->nr_pages << PAGE_SHIFT);
+ /* Remaining chunks are page-aligned; hop windows as needed. */
+ while (remaining_len) {
+ if (offset == end_offset) {
+ item = (
+ &window->list_member)->next;
+ window = list_entry(item,
+ struct reg_range_t,
+ list_member);
+ end_offset = window->offset +
+ (window->nr_pages << PAGE_SHIFT);
+ }
+
+ loop_len = min(PAGE_SIZE, remaining_len);
+
+ if (!(window_virt = get_local_va(offset, window, loop_len)))
+ return;
+
+ if (to_temp)
+ memcpy(temp, window_virt, loop_len);
+ else
+ memcpy(window_virt, temp, loop_len);
+
+ offset += loop_len;
+ temp += loop_len;
+ remaining_len -= loop_len;
+ }
+}
+
+/*
+ * micscif_rma_list_dma_copy_unaligned:
+ *
+ * One side of a DMA copy whose source/destination cache-line alignment
+ * differs, staged through the cache-aligned bounce buffer @temp.
+ * @src_local selects the direction: true DMAs temp -> dst window list,
+ * false DMAs src window list -> temp. The sub-cache-line head and tail
+ * are moved with CPU copies; the aligned body is DMAed window by window.
+ * Ends by queueing a DO_DMA_INTR descriptor carrying work->comp_cb.
+ * Returns 0 or a negative errno.
+ */
+static int micscif_rma_list_dma_copy_unaligned(struct mic_copy_work *work, uint8_t *temp, struct dma_channel *chan, bool src_local)
+{
+ struct dma_completion_cb *comp_cb = work->comp_cb;
+ dma_addr_t window_dma_addr, temp_dma_addr;
+#ifndef _MIC_SCIF_
+ dma_addr_t temp_phys = comp_cb->temp_phys;
+#endif
+ size_t loop_len, nr_contig_bytes = 0, remaining_len = work->len;
+ int offset_in_page;
+ uint64_t end_offset = 0, offset = 0;
+ struct reg_range_t *window = NULL;
+ struct list_head *item = NULL;
+ int ret = 0;
+ void *window_virt_addr = NULL;
+ size_t tail_len = 0;
+
+ if (src_local) {
+ offset = work->dst_offset;
+ window = work->dst_window;
+ } else {
+ offset = work->src_offset;
+ window = work->src_window;
+ }
+
+ /* Head: CPU copy until the window offset is cache-line aligned. */
+ offset_in_page = offset & (L1_CACHE_BYTES - 1);
+ if (offset_in_page) {
+ loop_len = L1_CACHE_BYTES - offset_in_page;
+ loop_len = min(loop_len, remaining_len);
+
+ if (!(window_virt_addr = ioremap_remote_gtt(offset, window, loop_len,
+ work->loopback, work->remote_dev,
+ get_chan_num(chan), work)))
+ return -ENOMEM;
+
+ if (src_local) {
+ micscif_unaligned_memcpy(window_virt_addr, temp, loop_len, work->ordered &&
+ !(remaining_len - loop_len));
+ serializing_request(window_virt_addr);
+ } else {
+ memcpy_fromio(temp, window_virt_addr, loop_len);
+ serializing_request(temp);
+ }
+#ifdef RMA_DEBUG
+ atomic_long_add_return(loop_len, &ms_info.rma_unaligned_cpu_cnt);
+#endif
+ smp_mb();
+ iounmap_remote(window_virt_addr, loop_len, work);
+
+ offset += loop_len;
+ temp += loop_len;
+#ifndef _MIC_SCIF_
+ temp_phys += loop_len;
+#endif
+ remaining_len -= loop_len;
+ }
+
+ offset_in_page = offset & ~PAGE_MASK;
+ end_offset = window->offset +
+ (window->nr_pages << PAGE_SHIFT);
+
+ /* Body: DMA the cache-aligned middle, contiguous run by run. */
+ tail_len = remaining_len & (L1_CACHE_BYTES - 1);
+ remaining_len -= tail_len;
+ while (remaining_len) {
+ if (offset == end_offset) {
+ item = (&window->list_member)->next;
+ window = list_entry(item,
+ struct reg_range_t,
+ list_member);
+ end_offset = window->offset +
+ (window->nr_pages << PAGE_SHIFT);
+ }
+#ifndef _MIC_SCIF_
+ temp_dma_addr = temp_phys;
+#else
+ temp_dma_addr = (dma_addr_t)virt_to_phys(temp);
+#endif
+ window_dma_addr = micscif_get_dma_addr(window, offset, &nr_contig_bytes, NULL, NULL);
+
+#ifdef CONFIG_ML1OM
+ if (RMA_ERROR_CODE == window_dma_addr)
+ return -ENXIO;
+#endif
+ loop_len = min(nr_contig_bytes, remaining_len);
+
+ if (src_local) {
+ if (unlikely(work->ordered && !tail_len &&
+ !(remaining_len - loop_len) &&
+ loop_len != L1_CACHE_BYTES)) {
+ /*
+ * Break up the last chunk of the transfer into two steps
+ * if there is no tail to guarantee DMA ordering.
+ * Passing DO_DMA_POLLING inserts a status update descriptor
+ * in step 1 which acts as a double sided synchronization
+ * fence for the DMA engine to ensure that the last cache line
+ * in step 2 is updated last.
+ */
+ /* Step 1) DMA: Body Length - L1_CACHE_BYTES. */
+ ret = do_dma(chan, DO_DMA_POLLING, temp_dma_addr, window_dma_addr,
+ loop_len - L1_CACHE_BYTES, NULL);
+ if (ret < 0) {
+ printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+ }
+ offset += (loop_len - L1_CACHE_BYTES);
+ temp_dma_addr += (loop_len - L1_CACHE_BYTES);
+ window_dma_addr += (loop_len - L1_CACHE_BYTES);
+ remaining_len -= (loop_len - L1_CACHE_BYTES);
+ loop_len = remaining_len;
+
+ /* Step 2) DMA: L1_CACHE_BYTES */
+ ret = do_dma(chan, 0, temp_dma_addr, window_dma_addr,
+ loop_len, NULL);
+ if (ret < 0) {
+ printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+ }
+ } else {
+ int flags = 0;
+ if (remaining_len == loop_len + L1_CACHE_BYTES)
+ flags = DO_DMA_POLLING;
+ ret = do_dma(chan, flags, temp_dma_addr, window_dma_addr,
+ loop_len, NULL);
+ }
+ } else {
+ ret = do_dma(chan, 0, window_dma_addr, temp_dma_addr,
+ loop_len, NULL);
+ }
+ if (ret < 0) {
+ printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+ }
+ offset += loop_len;
+ temp += loop_len;
+#ifndef _MIC_SCIF_
+ temp_phys += loop_len;
+#endif
+ remaining_len -= loop_len;
+ offset_in_page = 0;
+ }
+ /* Tail: CPU copy the sub-cache-line remainder. */
+ if (tail_len) {
+ if (offset == end_offset) {
+ item = (&window->list_member)->next;
+ window = list_entry(item,
+ struct reg_range_t,
+ list_member);
+ end_offset = window->offset +
+ (window->nr_pages << PAGE_SHIFT);
+ }
+ if (!(window_virt_addr = ioremap_remote_gtt(offset, window, tail_len,
+ work->loopback, work->remote_dev,
+ get_chan_num(chan), work)))
+ return -ENOMEM;
+
+ /*
+ * The CPU copy for the tail bytes must be initiated only once previous
+ * DMA transfers for this endpoint have completed to guarantee
+ * ordering.
+ */
+ if (unlikely(work->ordered)) {
+ free_dma_channel(chan);
+ work->dma_chan_released = true;
+ if ((ret = drain_dma_intr(chan)))
+ return ret;
+ }
+
+ if (src_local) {
+ micscif_unaligned_memcpy(window_virt_addr, temp, tail_len, work->ordered);
+ serializing_request(window_virt_addr);
+ } else {
+ memcpy_fromio(temp, window_virt_addr, tail_len);
+ serializing_request(temp);
+ }
+#ifdef RMA_DEBUG
+ atomic_long_add_return(tail_len, &ms_info.rma_unaligned_cpu_cnt);
+#endif
+ smp_mb();
+ iounmap_remote(window_virt_addr, tail_len, work);
+ }
+ if (work->dma_chan_released) {
+ if ((ret = request_dma_channel(chan)))
+ return ret;
+ /* Callee frees the DMA channel lock, if it is held */
+ work->dma_chan_released = false;
+ }
+ /* Queue the completion interrupt descriptor for comp_cb. */
+ ret = do_dma(chan, DO_DMA_INTR, 0, 0, 0, comp_cb);
+ if (ret < 0) {
+ printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * is_local_dma_addr - true if @addr is physical memory on this node.
+ * Card build: local pages lie below num_physpages. Host build: defer
+ * to is_syspa().
+ */
+static inline bool is_local_dma_addr(uint64_t addr)
+{
+#ifdef _MIC_SCIF_
+ return (addr >> PAGE_SHIFT < num_physpages);
+#else
+ return is_syspa(addr);
+#endif
+}
+
+/*
+ * micscif_rma_list_dma_copy_aligned:
+ *
+ * Traverse all the windows and perform DMA copy.
+ * Source and destination must share the same cache-line misalignment
+ * (BUG otherwise). The misaligned head and the sub-cache-line tail are
+ * moved with CPU copies; the aligned body is DMAed run by run. Returns
+ * 0 or a negative errno.
+ */
+static int micscif_rma_list_dma_copy_aligned(struct mic_copy_work *work, struct dma_channel *chan)
+{
+ dma_addr_t src_dma_addr, dst_dma_addr;
+ size_t loop_len, remaining_len, tail_len, src_contig_bytes = 0, dst_contig_bytes = 0;
+ int src_cache_off, dst_cache_off, src_last_index = 0, dst_last_index = 0;
+ uint64_t end_src_offset, end_dst_offset;
+ void *src_virt, *dst_virt;
+ struct reg_range_t *src_window = work->src_window;
+ struct reg_range_t *dst_window = work->dst_window;
+ uint64_t src_offset = work->src_offset, dst_offset = work->dst_offset;
+ uint64_t src_start_offset = src_window->offset, dst_start_offset = dst_window->offset;
+ struct list_head *item;
+ int ret = 0;
+
+ remaining_len = work->len;
+
+ src_cache_off = src_offset & (L1_CACHE_BYTES - 1);
+ dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1);
+ if (src_cache_off != dst_cache_off) {
+ BUG_ON(1);
+ } else if (src_cache_off != 0) {
+ /* Head */
+ loop_len = L1_CACHE_BYTES - src_cache_off;
+ loop_len = min(loop_len, remaining_len);
+ src_dma_addr = micscif_get_dma_addr(src_window, src_offset, NULL, NULL, NULL);
+ dst_dma_addr = micscif_get_dma_addr(dst_window, dst_offset, NULL, NULL, NULL);
+#ifdef CONFIG_ML1OM
+ if (RMA_ERROR_CODE == src_dma_addr)
+ return -ENXIO;
+ if (RMA_ERROR_CODE == dst_dma_addr)
+ return -ENXIO;
+ get_window_ref_count(src_window, 1);
+ get_window_ref_count(dst_window, 1);
+#endif
+ if (is_local_dma_addr(src_dma_addr))
+ src_virt = get_local_va(src_offset, src_window, loop_len);
+ else
+ src_virt = ioremap_remote_gtt(src_offset, src_window,
+ loop_len, work->loopback,
+ work->remote_dev, get_chan_num(chan), work);
+ if (!src_virt) {
+#ifdef CONFIG_ML1OM
+ put_window_ref_count(src_window, 1);
+ put_window_ref_count(dst_window, 1);
+#endif
+ return -ENOMEM;
+ }
+ if (is_local_dma_addr(dst_dma_addr))
+ dst_virt = get_local_va(dst_offset, dst_window, loop_len);
+ else
+ dst_virt = ioremap_remote_gtt(dst_offset, dst_window,
+ loop_len, work->loopback,
+ work->remote_dev, get_chan_num(chan), work);
+#ifdef CONFIG_ML1OM
+ put_window_ref_count(src_window, 1);
+ put_window_ref_count(dst_window, 1);
+#endif
+ if (!dst_virt) {
+ if (!is_local_dma_addr(src_dma_addr))
+ iounmap_remote(src_virt, loop_len, work);
+ return -ENOMEM;
+ }
+ if (is_local_dma_addr(src_dma_addr)){
+ micscif_unaligned_memcpy(dst_virt, src_virt, loop_len,
+ remaining_len == loop_len ? work->ordered : false);
+ }
+ else{
+ memcpy_fromio(dst_virt, src_virt, loop_len);
+ }
+ serializing_request(dst_virt);
+ smp_mb();
+ if (!is_local_dma_addr(src_dma_addr))
+ iounmap_remote(src_virt, loop_len, work);
+ if (!is_local_dma_addr(dst_dma_addr))
+ iounmap_remote(dst_virt, loop_len, work);
+ src_offset += loop_len;
+ dst_offset += loop_len;
+ remaining_len -= loop_len;
+ }
+
+ end_src_offset = src_window->offset +
+ (src_window->nr_pages << PAGE_SHIFT);
+ end_dst_offset = dst_window->offset +
+ (dst_window->nr_pages << PAGE_SHIFT);
+ /* Body: DMA the cache-aligned middle. */
+ tail_len = remaining_len & (L1_CACHE_BYTES - 1);
+ remaining_len -= tail_len;
+ while (remaining_len) {
+ if (src_offset == end_src_offset) {
+ item = (&src_window->list_member)->next;
+ src_window = list_entry(item,
+ struct reg_range_t,
+ list_member);
+ end_src_offset = src_window->offset +
+ (src_window->nr_pages << PAGE_SHIFT);
+ src_last_index = 0;
+ src_start_offset = src_window->offset;
+ }
+ if (dst_offset == end_dst_offset) {
+ item = (&dst_window->list_member)->next;
+ dst_window = list_entry(item, struct reg_range_t, list_member);
+ end_dst_offset = dst_window->offset +
+ (dst_window->nr_pages << PAGE_SHIFT);
+ dst_last_index = 0;
+ dst_start_offset = dst_window->offset;
+ }
+
+ /* compute dma addresses for transfer */
+ src_dma_addr = micscif_get_dma_addr(src_window, src_offset, &src_contig_bytes, &src_last_index, &src_start_offset);
+ dst_dma_addr = micscif_get_dma_addr(dst_window, dst_offset, &dst_contig_bytes, &dst_last_index, &dst_start_offset);
+#ifdef CONFIG_ML1OM
+ if (RMA_ERROR_CODE == src_dma_addr)
+ return -ENXIO;
+ if (RMA_ERROR_CODE == dst_dma_addr)
+ return -ENXIO;
+#endif
+ loop_len = min(src_contig_bytes, dst_contig_bytes);
+ loop_len = min(loop_len, remaining_len);
+ if (unlikely(work->ordered && !tail_len &&
+ !(remaining_len - loop_len) &&
+ loop_len != L1_CACHE_BYTES)) {
+ /*
+ * Break up the last chunk of the transfer into two steps
+ * if there is no tail to guarantee DMA ordering.
+ * Passing DO_DMA_POLLING inserts a status update descriptor
+ * in step 1 which acts as a double sided synchronization
+ * fence for the DMA engine to ensure that the last cache line
+ * in step 2 is updated last.
+ */
+ /* Step 1) DMA: Body Length - L1_CACHE_BYTES. */
+ ret = do_dma(chan, DO_DMA_POLLING, src_dma_addr, dst_dma_addr,
+ loop_len - L1_CACHE_BYTES, NULL);
+ if (ret < 0) {
+ printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+ }
+ src_offset += (loop_len - L1_CACHE_BYTES);
+ dst_offset += (loop_len - L1_CACHE_BYTES);
+ src_dma_addr += (loop_len - L1_CACHE_BYTES);
+ dst_dma_addr += (loop_len - L1_CACHE_BYTES);
+ remaining_len -= (loop_len - L1_CACHE_BYTES);
+ loop_len = remaining_len;
+
+ /* Step 2) DMA: L1_CACHE_BYTES */
+ ret = do_dma(chan, 0, src_dma_addr, dst_dma_addr,
+ loop_len, NULL);
+ if (ret < 0) {
+ printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+ }
+ } else {
+ int flags = 0;
+ if (remaining_len == loop_len + L1_CACHE_BYTES)
+ flags = DO_DMA_POLLING;
+ ret = do_dma(chan, flags, src_dma_addr, dst_dma_addr,
+ loop_len, NULL);
+ if (ret < 0) {
+ printk(KERN_ERR "%s %d Desc Prog Failed ret %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+ }
+ }
+ src_offset += loop_len;
+ dst_offset += loop_len;
+ remaining_len -= loop_len;
+ }
+#ifdef CONFIG_MK1OM
+ BUG_ON(remaining_len != 0);
+#endif
+#ifdef CONFIG_ML1OM
+ if (remaining_len)
+ return - ENXIO;
+#endif
+ /* Tail: CPU copy the sub-cache-line remainder. */
+ remaining_len = tail_len;
+ if (remaining_len) {
+ loop_len = remaining_len;
+ if (src_offset == end_src_offset) {
+ item = (&src_window->list_member)->next;
+ src_window = list_entry(item,
+ struct reg_range_t,
+ list_member);
+ }
+ if (dst_offset == end_dst_offset) {
+ item = (&dst_window->list_member)->next;
+ dst_window = list_entry(item, struct reg_range_t, list_member);
+ }
+
+ src_dma_addr = micscif_get_dma_addr(src_window, src_offset, NULL, NULL, NULL);
+ dst_dma_addr = micscif_get_dma_addr(dst_window, dst_offset, NULL, NULL, NULL);
+#ifdef CONFIG_ML1OM
+ if (RMA_ERROR_CODE == src_dma_addr)
+ return -ENXIO;
+ if (RMA_ERROR_CODE == dst_dma_addr)
+ return -ENXIO;
+#endif
+ /*
+ * The CPU copy for the tail bytes must be initiated only once previous
+ * DMA transfers for this endpoint have completed to guarantee
+ * ordering.
+ */
+ if (unlikely(work->ordered)) {
+ free_dma_channel(chan);
+ work->dma_chan_released = true;
+ if ((ret = drain_dma_poll(chan)))
+ return ret;
+ }
+#ifdef CONFIG_ML1OM
+ get_window_ref_count(src_window, 1);
+ get_window_ref_count(dst_window, 1);
+#endif
+ if (is_local_dma_addr(src_dma_addr))
+ src_virt = get_local_va(src_offset, src_window, loop_len);
+ else
+ src_virt = ioremap_remote_gtt(src_offset, src_window,
+ loop_len, work->loopback,
+ work->remote_dev, get_chan_num(chan), work);
+ if (!src_virt) {
+#ifdef CONFIG_ML1OM
+ put_window_ref_count(src_window, 1);
+ put_window_ref_count(dst_window, 1);
+#endif
+ return -ENOMEM;
+ }
+
+ if (is_local_dma_addr(dst_dma_addr))
+ dst_virt = get_local_va(dst_offset, dst_window, loop_len);
+ else
+ dst_virt = ioremap_remote_gtt(dst_offset, dst_window,
+ loop_len, work->loopback,
+ work->remote_dev, get_chan_num(chan), work);
+#ifdef CONFIG_ML1OM
+ put_window_ref_count(src_window, 1);
+ put_window_ref_count(dst_window, 1);
+#endif
+ if (!dst_virt) {
+ if (!is_local_dma_addr(src_dma_addr))
+ iounmap_remote(src_virt, loop_len, work);
+ return -ENOMEM;
+ }
+
+ if (is_local_dma_addr(src_dma_addr)){
+ micscif_unaligned_memcpy(dst_virt, src_virt, loop_len, work->ordered);
+ }
+ else{
+ memcpy_fromio(dst_virt, src_virt, loop_len);
+ }
+ serializing_request(dst_virt);
+ smp_mb();
+ if (!is_local_dma_addr(src_dma_addr))
+ iounmap_remote(src_virt, loop_len, work);
+
+ if (!is_local_dma_addr(dst_dma_addr))
+ iounmap_remote(dst_virt, loop_len, work);
+
+ remaining_len -= loop_len;
+#ifdef CONFIG_MK1OM
+ BUG_ON(remaining_len != 0);
+#endif
+#ifdef CONFIG_ML1OM
+ if (remaining_len)
+ return - ENXIO;
+#endif
+ }
+
+ return ret;
+}
+
+/*
+ * micscif_rma_list_dma_copy_wrapper:
+ *
+ * Chooses between the aligned and unaligned DMA copy paths based on the
+ * relative cache-line misalignment of source and destination offsets.
+ * The unaligned path stages data through a cache-aligned temporary
+ * buffer tracked by a freshly allocated dma_completion_cb. Returns 0 on
+ * success, -ENOMEM on allocation/mapping failure or if the unaligned
+ * copy fails.
+ */
+int micscif_rma_list_dma_copy_wrapper(struct endpt *epd, struct mic_copy_work *work, struct dma_channel *chan, off_t loffset)
+{
+ int src_cache_off, dst_cache_off;
+ uint64_t src_offset = work->src_offset, dst_offset = work->dst_offset;
+ uint8_t *temp = NULL;
+ bool src_local = true, dst_local = false;
+ struct dma_completion_cb *comp_cb;
+ dma_addr_t src_dma_addr, dst_dma_addr;
+#ifndef _MIC_SCIF_
+ struct pci_dev *pdev;
+#endif
+
+ src_cache_off = src_offset & (L1_CACHE_BYTES - 1);
+ dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1);
+ /* Identical misalignment: both sides can be DMAed directly. */
+ if (dst_cache_off == src_cache_off)
+ return micscif_rma_list_dma_copy_aligned(work, chan);
+
+ /* Unaligned loopback is handled by CPU copy, card build only. */
+ if (work->loopback) {
+#ifdef _MIC_SCIF_
+ BUG_ON(micscif_rma_list_cpu_copy(work));
+ return 0;
+#else
+ BUG_ON(1);
+#endif
+ }
+
+ src_dma_addr = micscif_get_dma_addr(work->src_window, src_offset, NULL, NULL, NULL);
+ dst_dma_addr = micscif_get_dma_addr(work->dst_window, dst_offset, NULL, NULL, NULL);
+
+ if (is_local_dma_addr(src_dma_addr))
+ src_local = true;
+ else
+ src_local = false;
+
+ if (is_local_dma_addr(dst_dma_addr))
+ dst_local = true;
+ else
+ dst_local = false;
+
+ dst_local = dst_local; /* self-assignment quiets "set but not used" */
+ BUG_ON(work->len + (L1_CACHE_BYTES << 1) > KMEM_UNALIGNED_BUF_SIZE);
+
+ /* Allocate dma_completion cb */
+ if (!(comp_cb = kzalloc(sizeof(*comp_cb), GFP_KERNEL)))
+ goto error;
+
+ work->comp_cb = comp_cb;
+ comp_cb->cb_cookie = (uint64_t)comp_cb;
+ comp_cb->dma_completion_func = &micscif_rma_completion_cb;
+
+ /* Small transfers get a kmalloc buffer, large ones a cache slab. */
+ if (work->len + (L1_CACHE_BYTES << 1) < KMEM_UNALIGNED_BUF_SIZE) {
+ comp_cb->is_cache = false;
+ if (!(temp = kmalloc(work->len + (L1_CACHE_BYTES << 1), GFP_KERNEL)))
+ goto free_comp_cb;
+ comp_cb->temp_buf_to_free = temp;
+ /* kmalloc(..) does not guarantee cache line alignment */
+ if ((uint64_t)temp & (L1_CACHE_BYTES - 1))
+ temp = (uint8_t*)ALIGN((uint64_t)temp, L1_CACHE_BYTES);
+ } else {
+ comp_cb->is_cache = true;
+ if (!(temp = micscif_kmem_cache_alloc()))
+ goto free_comp_cb;
+ comp_cb->temp_buf_to_free = temp;
+ }
+
+ if (src_local) {
+ /* Local source: pre-fill temp (shifted to dst's misalignment). */
+ temp += dst_cache_off;
+ comp_cb->tmp_offset = dst_cache_off;
+ micscif_rma_local_cpu_copy(work->src_offset, work->src_window, temp, work->len, true);
+ } else {
+ /* Remote source: completion callback finishes the copy later. */
+ comp_cb->dst_window = work->dst_window;
+ comp_cb->dst_offset = work->dst_offset;
+ work->src_offset = work->src_offset - src_cache_off;
+ comp_cb->len = work->len;
+ work->len = ALIGN(work->len + src_cache_off, L1_CACHE_BYTES);
+ comp_cb->header_padding = src_cache_off;
+ }
+ comp_cb->temp_buf = temp;
+
+#ifndef _MIC_SCIF_
+ micscif_pci_dev(work->remote_dev->sd_node, &pdev);
+ comp_cb->temp_phys = mic_map_single(work->remote_dev->sd_node - 1,
+ pdev, temp, KMEM_UNALIGNED_BUF_SIZE);
+
+ if (mic_map_error(comp_cb->temp_phys)) {
+ goto free_temp_buf;
+ }
+
+ comp_cb->remote_node = work->remote_dev->sd_node;
+#endif
+ /*
+ * NOTE(review): on the host build, if the copy below fails the
+ * comp_cb->temp_phys mapping is not undone on the free_temp_buf
+ * path — confirm whether a mic_unmap_single() is needed here.
+ */
+ if (0 > micscif_rma_list_dma_copy_unaligned(work, temp, chan, src_local))
+ goto free_temp_buf;
+ if (!src_local)
+ work->fence_type = DO_DMA_INTR;
+ return 0;
+free_temp_buf:
+ if (comp_cb->is_cache)
+ micscif_kmem_cache_free(comp_cb->temp_buf_to_free);
+ else
+ kfree(comp_cb->temp_buf_to_free);
+free_comp_cb:
+ kfree(comp_cb);
+error:
+ printk(KERN_ERR "Unable to malloc %s %d\n", __func__, __LINE__);
+ return -ENOMEM;
+}
+
+#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT)
+static int softlockup_threshold = 60;
+/* Timer callback: sets the caller's on-stack flag passed via @data. */
+static void avert_softlockup(unsigned long data)
+{
+ *(unsigned long*)data = 1;
+}
+
+/*
+ * Arm a timer to handle the case of hogging the cpu for
+ * time > softlockup_threshold.
+ * The timer fires after softlockup_threshold / 3 so that even if
+ * there is a huge delay in running our timer, we still don't hit
+ * the softlockup case. (softlockup_tick() runs in hardirq context
+ * while timers run in softirq context.)
+ */
+static inline void add_softlockup_timer(struct timer_list *timer, unsigned long *data)
+{
+ setup_timer(timer, avert_softlockup, (unsigned long) data);
+ timer->expires = jiffies + usecs_to_jiffies(softlockup_threshold * 1000000 / 3);
+ add_timer(timer);
+}
+
+static inline void del_softlockup_timer(struct timer_list *timer)
+{
+ /* Must delete synchronously: the flag written by the timer
+ * callback lives on the caller's stack and must not be touched
+ * after the caller returns.
+ */
+ del_timer_sync(timer);
+}
+#endif
+
+/*
+ * micscif_rma_list_cpu_copy:
+ *
+ * Traverse all the windows and perform CPU copy.
+ * Copies work->len bytes from the source window list to the destination
+ * window list at most a page per iteration, mapping remote windows as
+ * needed and touching the softlockup watchdog on non-preemptible
+ * builds. Returns 0, or -ENOMEM when a mapping fails.
+ */
+int micscif_rma_list_cpu_copy(struct mic_copy_work *work)
+{
+ void *src_virt, *dst_virt;
+ size_t loop_len, remaining_len;
+ int src_cache_off, dst_cache_off;
+ uint64_t src_offset = work->src_offset, dst_offset = work->dst_offset;
+ struct reg_range_t *src_window = work->src_window;
+ struct reg_range_t *dst_window = work->dst_window;
+ uint64_t end_src_offset, end_dst_offset;
+ struct list_head *item;
+ int srcchunk_ind = 0;
+ int dstchunk_ind = 0;
+ uint64_t src_start_offset, dst_start_offset;
+ int ret = 0;
+#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT)
+ unsigned long timer_fired = 0;
+ struct timer_list timer;
+ int cpu = smp_processor_id();
+ add_softlockup_timer(&timer, &timer_fired);
+#endif
+
+ remaining_len = work->len;
+ src_start_offset = src_window->offset;
+ dst_start_offset = dst_window->offset;
+
+ while (remaining_len) {
+#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT)
+ /* Ideally we should call schedule only if we didn't sleep
+ * in between. But there is no way to know that.
+ */
+ if (timer_fired) {
+ timer_fired = 0;
+ if (smp_processor_id() == cpu)
+ touch_softlockup_watchdog();
+ else
+ cpu = smp_processor_id();
+ add_softlockup_timer(&timer, &timer_fired);
+ }
+#endif
+ src_cache_off = src_offset & ~PAGE_MASK; /* page offset, despite the name */
+ dst_cache_off = dst_offset & ~PAGE_MASK; /* page offset, despite the name */
+ /* Copy up to the nearer of the two page boundaries. */
+ loop_len = PAGE_SIZE -
+ ((src_cache_off > dst_cache_off) ?
+ src_cache_off : dst_cache_off);
+ if (remaining_len < loop_len)
+ loop_len = remaining_len;
+
+ if (RMA_WINDOW_SELF == src_window->type)
+ src_virt = get_local_va(src_offset, src_window, loop_len);
+ else
+ src_virt = ioremap_remote(src_offset,
+ src_window, loop_len, work->loopback, work->remote_dev, &srcchunk_ind, &src_start_offset);
+ if (!src_virt) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (RMA_WINDOW_SELF == dst_window->type)
+ dst_virt = get_local_va(dst_offset, dst_window, loop_len);
+ else
+ dst_virt = ioremap_remote(dst_offset,
+ dst_window, loop_len, work->loopback, work->remote_dev, &dstchunk_ind, &dst_start_offset);
+ if (!dst_virt) {
+ if (RMA_WINDOW_PEER == src_window->type)
+ iounmap_remote(src_virt, loop_len, work);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (work->loopback)
+ memcpy(dst_virt, src_virt, loop_len);
+ else {
+
+ if (RMA_WINDOW_SELF == src_window->type){
+ memcpy_toio(dst_virt, src_virt, loop_len);
+ }
+ else{
+ memcpy_fromio(dst_virt, src_virt, loop_len);
+ }
+ serializing_request(dst_virt);
+ smp_mb();
+ }
+ if (RMA_WINDOW_PEER == src_window->type)
+ iounmap_remote(src_virt, loop_len, work);
+
+ if (RMA_WINDOW_PEER == dst_window->type)
+ iounmap_remote(dst_virt, loop_len, work);
+
+ src_offset += loop_len;
+ dst_offset += loop_len;
+ remaining_len -= loop_len;
+ /* Hop to the next window in either list when exhausted. */
+ if (remaining_len) {
+ end_src_offset = src_window->offset +
+ (src_window->nr_pages << PAGE_SHIFT);
+ end_dst_offset = dst_window->offset +
+ (dst_window->nr_pages << PAGE_SHIFT);
+ if (src_offset == end_src_offset) {
+ item = (
+ &src_window->list_member)->next;
+ src_window = list_entry(item,
+ struct reg_range_t,
+ list_member);
+ srcchunk_ind = 0;
+ src_start_offset = src_window->offset;
+ }
+ if (dst_offset == end_dst_offset) {
+ item = (
+ &dst_window->list_member)->next;
+ dst_window = list_entry(item,
+ struct reg_range_t,
+ list_member);
+ dstchunk_ind = 0;
+ dst_start_offset = dst_window->offset;
+ }
+ }
+ }
+error:
+#if !defined(WINDOWS) && !defined(CONFIG_PREEMPT)
+ del_softlockup_timer(&timer);
+#endif
+ return ret;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include "mic/micscif.h"
+#include "mic/micscif_smpt.h"
+#include "mic/mic_dma_api.h"
+#include "mic/micscif_kmem_cache.h"
+#ifdef CONFIG_MMU_NOTIFIER
+#include <linux/mmu_notifier.h>
+#include <linux/highmem.h>
+#endif
+#ifndef _MIC_SCIF_
+#include "mic_common.h"
+#endif
+#include "mic/micscif_map.h"
+
+/*
+ * micscif_insert_tcw:
+ *
+ * Insert a temp window to the temp registration list sorted by va_for_temp.
+ * RMA lock must be held.
+ */
+void micscif_insert_tcw(struct reg_range_t *window,
+ struct list_head *head)
+{
+ struct reg_range_t *curr = NULL, *prev = NULL;
+ struct list_head *item;
+ BUG_ON(!window);
+ INIT_LIST_HEAD(&window->list_member);
+ /*
+ * HSD 4845254
+ * Hack for worst case performance
+ * Compare with tail and if the new entry is new tail add it to the end
+ */
+ if (!list_empty(head)) {
+ curr = list_entry(head->prev, struct reg_range_t, list_member);
+ if ((uint64_t) curr->va_for_temp < (uint64_t) window->va_for_temp) {
+ list_add_tail(&window->list_member, head);
+ return;
+ }
+ }
+ /*
+ * We don't need the if(!prev) code but I am gonna leave it as
+ * is for now. If someone touches the above code it is likely that they
+ * will miss that they have to add if(!prev) block
+ */
+ list_for_each(item, head) {
+ curr = list_entry(item, struct reg_range_t, list_member);
+ if ((uint64_t) curr->va_for_temp > (uint64_t) window->va_for_temp)
+ break;
+ prev = curr;
+ }
+ /* prev is the last entry with va_for_temp <= window's; insert after it. */
+ if (!prev)
+ list_add(&window->list_member, head);
+ else
+ list_add(&window->list_member, &prev->list_member);
+}
+/*
+ * micscif_insert_window:
+ *
+ * Insert a window into the self registration list, keeping the list
+ * sorted by offset. RMA lock must be held.
+ */
+void micscif_insert_window(struct reg_range_t *window, struct list_head *head)
+{
+ struct reg_range_t *entry;
+ struct list_head *pos;
+ struct list_head *insert_after = head;
+
+ BUG_ON(!window);
+ INIT_LIST_HEAD(&window->list_member);
+ /* Find the last node whose offset does not exceed the new window's. */
+ list_for_each(pos, head) {
+ entry = list_entry(pos, struct reg_range_t, list_member);
+ if (entry->offset > window->offset)
+ break;
+ insert_after = pos;
+ }
+ list_add(&window->list_member, insert_after);
+}
+
+/*
+ * micscif_query_tcw:
+ *
+ * Query the temp cached registration list of ep and check if a valid contiguous
+ * range of windows exist.
+ * If there is a partial overlap, delete the existing window and create a new one
+ * that encompasses the previous window and a new range
+ * RMA lock must be held.
+ *
+ * Returns 0 with *req->out_window set on a full hit; returns -ENXIO when
+ * the caller must (re)register, possibly with req->va_for_temp/nr_bytes
+ * extended to also cover a partially overlapping window destroyed here.
+ */
+int micscif_query_tcw(struct endpt *ep, struct micscif_rma_req *req)
+{
+ struct list_head *item, *temp;
+ struct reg_range_t *window;
+ uint64_t start_va_window, start_va_req = (uint64_t) req->va_for_temp;
+ uint64_t end_va_window, end_va_req = start_va_req + req->nr_bytes;
+
+ /*
+ * HSD 4845254
+ * Hack for the worst case scenario
+ * Avoid traversing the entire list to find out that there is no
+ * entry that matches
+ */
+ if (!list_empty(req->head)) {
+ temp = req->head->prev;
+ window = list_entry(temp,
+ struct reg_range_t, list_member);
+ end_va_window = (uint64_t) window->va_for_temp +
+ (window->nr_pages << PAGE_SHIFT);
+ if (start_va_req > end_va_window)
+ return -ENXIO;
+ }
+ list_for_each_safe(item, temp, req->head) {
+ window = list_entry(item,
+ struct reg_range_t, list_member);
+ start_va_window = (uint64_t) window->va_for_temp;
+ end_va_window = (uint64_t) window->va_for_temp +
+ (window->nr_pages << PAGE_SHIFT);
+ pr_debug("%s %d start_va_window 0x%llx end_va_window 0x%llx"
+ " start_va_req 0x%llx end_va_req 0x%llx req->nr_bytes 0x%lx\n",
+ __func__, __LINE__, start_va_window, end_va_window,
+ start_va_req, end_va_req, req->nr_bytes);
+ if (start_va_req < start_va_window) {
+ if (end_va_req < start_va_window) {
+ /* No overlap */
+ } else {
+ if ((window->prot & req->prot) != req->prot) {
+ /* Insufficient access rights: just drop the old window below. */
+ } else {
+ req->nr_bytes += ((end_va_req > end_va_window) ? 0:(end_va_window - end_va_req));
+ pr_debug("%s %d Extend req->va_for_temp %p req->nr_byte 0x%lx\n",
+ __func__, __LINE__, req->va_for_temp, req->nr_bytes);
+ }
+ __micscif_rma_destroy_tcw_helper(window);
+ }
+ break;
+ } else {
+ if (start_va_req > end_va_window) {
+ /* No overlap */
+ continue;
+ } else {
+ if ((window->prot & req->prot) != req->prot) {
+ __micscif_rma_destroy_tcw_helper(window);
+ break;
+ }
+ if (end_va_req > end_va_window) {
+ /* Partial hit: grow the request to span both ranges. */
+ req->va_for_temp = (void*) start_va_window;
+ req->nr_bytes = end_va_req - start_va_window;
+ pr_debug("%s %d Extend req->va_for_temp %p req->nr_byte 0x%lx\n",
+ __func__, __LINE__, req->va_for_temp, req->nr_bytes);
+ __micscif_rma_destroy_tcw_helper(window);
+ return -ENXIO;
+ } else {
+ /* Full hit. */
+ *(req->out_window) = window;
+ return 0;
+ }
+ }
+ }
+ }
+ pr_debug("%s %d ENXIO\n", __func__, __LINE__);
+ return -ENXIO;
+}
+
+/*
+ * micscif_query_window:
+ *
+ * Query the registration list and check if a valid contiguous
+ * range of windows exist.
+ * RMA lock must be held.
+ *
+ * Returns 0 with *req->out_window set to the first covering window,
+ * -EPERM if protections are insufficient, -ENXIO if the range is not
+ * (contiguously) covered.
+ */
+int micscif_query_window(struct micscif_rma_req *req)
+{
+	struct list_head *item;
+	struct reg_range_t *window;
+	uint64_t end_offset, offset = req->offset;
+	uint64_t tmp_min, nr_bytes_left = req->nr_bytes;
+
+	list_for_each(item, req->head) {
+		window = list_entry(item,
+			struct reg_range_t, list_member);
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		/* List is sorted by offset, so falling short means a hole. */
+		if (offset < window->offset)
+			/* Offset not found! */
+			return -ENXIO;
+		if (offset < end_offset) {
+			/* Check read/write protections. */
+			if ((window->prot & req->prot) != req->prot)
+				return -EPERM;
+			if (nr_bytes_left == req->nr_bytes)
+				/* Store the first window */
+				*(req->out_window) = window;
+			/* Consume as much of the request as this window covers. */
+			tmp_min = min(end_offset - offset, nr_bytes_left);
+			nr_bytes_left -= tmp_min;
+			offset += tmp_min;
+			/*
+			 * Range requested encompasses
+			 * multiple windows contiguously.
+			 */
+			if (!nr_bytes_left) {
+				/* Done for partial window */
+				if (req->type == WINDOW_PARTIAL ||
+					req->type == WINDOW_SINGLE)
+					return 0;
+				/* Extra logic for full windows */
+				if (offset == end_offset)
+					/* Spanning multiple whole windows */
+					return 0;
+				/* Not spanning multiple whole windows */
+				return -ENXIO;
+			}
+			/* WINDOW_SINGLE must be satisfied by one window only. */
+			if (req->type == WINDOW_SINGLE)
+				break;
+		}
+	}
+	printk(KERN_ERR "%s %d ENXIO\n", __func__, __LINE__);
+	return -ENXIO;
+}
+
+/*
+ * micscif_rma_list_mmap:
+ *
+ * Traverse the remote registration list starting from start_window:
+ * 1) Check read/write protections.
+ * 2) Create VtoP mappings via remap_pfn_range(..)
+ * 3) Once step 1) and 2) complete successfully then traverse the range of
+ * windows again and bump the reference count.
+ * RMA lock must be held.
+ *
+ * Returns 0 on success or the (negative) remap_pfn_range() error.
+ */
+int micscif_rma_list_mmap(struct reg_range_t *start_window,
+	uint64_t offset, int nr_pages, struct vm_area_struct *vma)
+{
+	struct list_head *item, *head;
+	uint64_t end_offset, loop_offset = offset;
+	struct reg_range_t *window;
+	int64_t start_page_nr, loop_nr_pages, nr_pages_left = nr_pages;
+	struct endpt *ep = (struct endpt *)start_window->ep;
+	int i, err = 0;
+	uint64_t j =0;	/* running page index into the VMA */
+	dma_addr_t phys_addr;
+
+	might_sleep();
+	BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock));
+
+	/* Start traversing from the previous link in the list */
+	/* (so list_for_each visits start_window itself first) */
+	head = ((&start_window->list_member))->prev;
+	list_for_each(item, head) {
+		window = list_entry(item, struct reg_range_t,
+			list_member);
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		/* First page of this window covered by the request. */
+		start_page_nr = (loop_offset - window->offset) >> PAGE_SHIFT;
+		loop_nr_pages = min((int64_t)((end_offset - loop_offset) >> PAGE_SHIFT),
+			nr_pages_left);
+		for (i = (int)start_page_nr;
+			i < ((int)start_page_nr + (int)loop_nr_pages); i++, j++) {
+
+			phys_addr =
+#if !defined(_MIC_SCIF_) && defined(CONFIG_ML1OM)
+			/* Loopback uses the DMA address directly; otherwise the
+			 * cached per-page physical address is used. */
+			is_self_scifdev(ep->remote_dev) ?
+				micscif_get_dma_addr(window, loop_offset,
+			NULL, NULL, NULL) : window->phys_addr[i];
+#else
+			get_phys_addr(micscif_get_dma_addr(window, loop_offset,
+				NULL, NULL, NULL), ep->remote_dev);
+#endif
+			/*
+			 * Note:
+			 * 1) remap_pfn_rnage returns an error if there is an
+			 * attempt to create MAP_PRIVATE COW mappings.
+			 */
+			if ((err = remap_pfn_range(vma,
+				((vma)->vm_start) + (j * PAGE_SIZE),
+				phys_addr >> PAGE_SHIFT,
+				PAGE_SIZE,
+				((vma)->vm_page_prot))))
+				goto error;
+			loop_offset += PAGE_SIZE;
+		}
+		nr_pages_left -= loop_nr_pages;
+		if (!nr_pages_left)
+			break;
+	}
+	BUG_ON(nr_pages_left);
+	/*
+	 * No more failures expected. Bump up the ref count for all
+	 * the windows. Another traversal from start_window required
+	 * for handling errors encountered across windows during
+	 * remap_pfn_range(..).
+	 */
+	loop_offset = offset;
+	nr_pages_left = nr_pages;
+	head = (&(start_window->list_member))->prev;
+	list_for_each(item, head) {
+		window = list_entry(item, struct reg_range_t,
+			list_member);
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		start_page_nr = (loop_offset - window->offset) >> PAGE_SHIFT;
+		loop_nr_pages = min((int64_t)((end_offset - loop_offset) >> PAGE_SHIFT),
+			nr_pages_left);
+		get_window_ref_count(window, loop_nr_pages);
+		nr_pages_left -= loop_nr_pages;
+		loop_offset += (loop_nr_pages << PAGE_SHIFT);
+		if (!nr_pages_left)
+			break;
+	}
+	BUG_ON(nr_pages_left);
+error:
+	if (err)
+		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+	return err;
+}
+
+/*
+ * micscif_rma_list_munmap:
+ *
+ * Traverse the remote registration list starting from window:
+ * 1) Decrement ref count.
+ * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer.
+ * RMA lock must be held.
+ */
+void micscif_rma_list_munmap(struct reg_range_t *start_window,
+	uint64_t offset, int nr_pages)
+{
+	struct list_head *item, *tmp, *head;
+	struct nodemsg msg;
+	uint64_t loop_offset = offset, end_offset;
+	int64_t loop_nr_pages, nr_pages_left = nr_pages;
+	struct endpt *ep = (struct endpt *)start_window->ep;
+	struct reg_range_t *window;
+
+	BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock));
+
+	msg.uop = SCIF_MUNMAP;
+	msg.src = ep->port;
+	loop_offset = offset;
+	nr_pages_left = nr_pages;
+	/* Start traversing from the previous link in the list */
+	/* (so list_for_each_safe visits start_window itself first) */
+	head = (&(start_window->list_member))->prev;
+	list_for_each_safe(item, tmp, head) {
+		window = list_entry(item, struct reg_range_t,
+			list_member);
+		RMA_MAGIC(window);
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		/* Pages of this window covered by the unmap request. */
+		loop_nr_pages = min((int64_t)((end_offset - loop_offset) >> PAGE_SHIFT),
+			nr_pages_left);
+		put_window_ref_count(window, loop_nr_pages);
+		if (!window->ref_count) {
+			/* Quiesce outstanding DMA before tearing the window down. */
+			if (scifdev_alive(ep))
+				drain_dma_intr(ep->rma_info.dma_chan);
+			/* Inform the peer about this munmap */
+			msg.payload[0] = window->peer_window;
+			/* No error handling for Notification messages. */
+			micscif_nodeqp_send(ep->remote_dev, &msg, ep);
+			list_del(&window->list_member);
+			/* Destroy this window from the peer's registered AS */
+			micscif_destroy_remote_window(ep, window);
+		}
+		nr_pages_left -= loop_nr_pages;
+		loop_offset += (loop_nr_pages << PAGE_SHIFT);
+		if (!nr_pages_left)
+			break;
+	}
+	BUG_ON(nr_pages_left);
+}
+
+/*
+ * micscif_rma_list_unregister:
+ *
+ * Traverse the self registration list starting from window:
+ * 1) Call micscif_unregister_window(..)
+ * RMA lock must be held.
+ *
+ * Returns 0 on success or the first micscif_unregister_window() error.
+ */
+int micscif_rma_list_unregister(struct reg_range_t *window,
+	uint64_t offset, int nr_pages)
+{
+	struct list_head *item, *tmp, *head;
+	uint64_t end_offset;
+	int err = 0;
+	int64_t loop_nr_pages;
+	struct endpt *ep = (struct endpt *)window->ep;
+
+	BUG_ON(!mutex_is_locked(&ep->rma_info.rma_lock));
+	/* Start traversing from the previous link in the list */
+	/* (so list_for_each_safe visits 'window' itself first) */
+	head = (&window->list_member)->prev;
+	list_for_each_safe(item, tmp, head) {
+		window = list_entry(item, struct reg_range_t,
+			list_member);
+		RMA_MAGIC(window);
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		/* NOTE(review): the min() mixes int and the caller's nr_pages;
+		 * presumably ranges stay well below INT_MAX pages — verify. */
+		loop_nr_pages = min((int)((end_offset - offset) >> PAGE_SHIFT),
+			nr_pages);
+		if ((err = micscif_unregister_window(window)))
+			return err;
+		nr_pages -= (int)loop_nr_pages;
+		offset += (loop_nr_pages << PAGE_SHIFT);
+		if (!nr_pages)
+			break;
+	}
+	BUG_ON(nr_pages);
+	return 0;
+}
+
+/*
+ * micscif_unregister_all_window:
+ *
+ * Traverse all the windows in the self registration list and:
+ * 1) Call micscif_unregister_window(..)
+ * RMA lock must be held.
+ *
+ * Also waits for outstanding fences to drain and, with MMU notifiers
+ * enabled, queues the endpoint for notifier cleanup.
+ */
+int micscif_unregister_all_windows(scif_epd_t epd)
+{
+	struct list_head *item, *tmp;
+	struct reg_range_t *window;
+	struct endpt *ep = (struct endpt *)epd;
+	struct list_head *head = &ep->rma_info.reg_list;
+	int err = 0;
+
+	/* Kick the misc work queue before taking the RMA lock. */
+	queue_work(ms_info.mi_misc_wq, &ms_info.mi_misc_work);
+	mutex_lock(&ep->rma_info.rma_lock);
+retry:
+	item = NULL;
+	tmp = NULL;
+	list_for_each_safe(item, tmp, head) {
+		window = list_entry(item,
+			struct reg_range_t, list_member);
+		/* Cleared before each unregister; set again asynchronously if
+		 * some other context deleted a list entry meanwhile. */
+		ep->rma_info.async_list_del = 0;
+		if ((err = micscif_unregister_window(window)))
+			pr_debug("%s %d err %d\n",
+				__func__, __LINE__, err);
+		/*
+		 * Need to restart list traversal if there has been
+		 * an asynchronous list entry deletion.
+		 */
+		if (ep->rma_info.async_list_del)
+			goto retry;
+	}
+	mutex_unlock(&ep->rma_info.rma_lock);
+
+	/*
+	 * The following waits cannot be interruptible since they are
+	 * from the driver release() entry point.
+	 */
+	err = wait_event_timeout(ep->rma_info.fence_wq,
+		!ep->rma_info.fence_refcount, NODE_ALIVE_TIMEOUT);
+	/* Timeout firing is unexpected. Is the DMA engine hung? */
+	if (!err)
+		printk(KERN_ERR "%s %d err %d\n", __func__, __LINE__, err);
+
+#ifdef CONFIG_MMU_NOTIFIER
+	if (!list_empty(&ep->rma_info.mmn_list)) {
+		/* Defer MMU-notifier teardown to the dedicated work queue. */
+		spin_lock(&ms_info.mi_rmalock);
+		list_add_tail(&ep->mmu_list, &ms_info.mi_mmu_notif_cleanup);
+		spin_unlock(&ms_info.mi_rmalock);
+		queue_work(ms_info.mi_mmu_notif_wq, &ms_info.mi_mmu_notif_work);
+	}
+#endif
+	return err;
+}
+
+/*
+ * micscif_rma_list_get_pages_check:
+ *
+ * Traverse the remote registration list and return 0 if all the
+ * scif_get_pages()/scif_put_pages() ref_counts are zero else return -1.
+ */
+int micscif_rma_list_get_pages_check(struct endpt *ep)
+{
+	struct reg_range_t *range;
+	struct list_head *pos;
+	int ret = 0;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	list_for_each(pos, &ep->rma_info.remote_reg_list) {
+		range = list_entry(pos,
+			struct reg_range_t, list_member);
+		/* Any live get_pages reference makes the check fail. */
+		if (range->get_put_ref_count) {
+			ret = -1;
+			break;
+		}
+	}
+	mutex_unlock(&ep->rma_info.rma_lock);
+	return ret;
+}
+
+/* Only debug API's below */
+/* Dump every window on @head via micscif_display_window(). */
+void micscif_display_all_windows(struct list_head *head)
+{
+	struct reg_range_t *range;
+	struct list_head *pos;
+
+	pr_debug("\nWindow List Start\n");
+	list_for_each(pos, head) {
+		range = list_entry(pos,
+			struct reg_range_t, list_member);
+		micscif_display_window(range, __func__, __LINE__);
+	}
+	pr_debug("Window List End\n\n");
+}
--- /dev/null
+/*
+ * Implementation of select and poll
+ *
+ * Copyright 2011-2012 Intel Corporation.
+ *
+ * This file is a derivative of fs/select.c from within the Linux kernel
+ * source distribution, version 2.6.34; it has been modified (starting
+ * in May 2011) to work within the context of the SCIF driver.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA.
+ *
+ * Initial comment from fs/select.c:
+ *
+ * This file contains the procedures for the handling of select and poll
+ *
+ * Created for Linux based loosely upon Mathius Lattner's minix
+ * patches by Peter MacDonald. Heavily edited by Linus.
+ *
+ * 4 February 1994
+ * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
+ * flag set in its personality we do *not* modify the given timeout
+ * parameter to reflect time remaining.
+ *
+ * 24 January 2000
+ * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
+ * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/hrtimer.h>
+#include <linux/module.h>
+
+#include "mic/micscif.h"
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+#include <linux/sched/rt.h>
+#endif
+
+/*
+ * One page worth of poll_table_entry slots; pages are chained when the
+ * inline entries in struct poll_wqueues run out.
+ */
+struct poll_table_page {
+	struct poll_table_page *next;	/* next overflow page, or NULL */
+	struct poll_table_entry *entry;	/* first unused slot on this page */
+	struct poll_table_entry entries[];	/* C99 flexible array member (was the GNU [0] idiom) */
+};
+
+/*
+ * Estimate expected accuracy in ns from a timeval.
+ *
+ * After quite a bit of churning around, we've settled on
+ * a simple thing of taking 0.1% of the timeout as the
+ * slack, with a cap of 100 msec.
+ * "nice" tasks get a 0.5% slack instead.
+ *
+ * Consider this comment an open invitation to come up with even
+ * better solutions..
+ */
+
+#define MAX_SLACK (100 * NSEC_PER_MSEC)
+
+/*
+ * Derive a wakeup slack of ~0.1% of the remaining timeout (0.5% for
+ * positively niced tasks), capped at MAX_SLACK. Already-expired
+ * (negative) timeouts get no slack.
+ */
+static long __estimate_accuracy(struct timespec *tv)
+{
+	int div = 1000;
+	long result;
+
+	if (tv->tv_sec < 0)
+		return 0;
+
+	/* "nice" tasks tolerate five times more slack. */
+	if (task_nice(current) > 0)
+		div /= 5;
+
+	/* Guard the multiplication below against overflow. */
+	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/div))
+		return MAX_SLACK;
+
+	result = tv->tv_sec * (NSEC_PER_SEC/div) + tv->tv_nsec / div;
+
+	return (result > MAX_SLACK) ? MAX_SLACK : result;
+}
+
+/*
+ * Compute the hrtimer slack for a wakeup at absolute time *tv.
+ * Realtime tasks get zero slack; everyone else gets at least the
+ * task's configured timer_slack_ns.
+ */
+static long estimate_accuracy(struct timespec *tv)
+{
+	struct timespec remaining;
+	unsigned long slack;
+
+	/*
+	 * Realtime tasks get a slack of 0 for obvious reasons.
+	 */
+	if (rt_task(current))
+		return 0;
+
+	ktime_get_ts(&remaining);
+	remaining = timespec_sub(*tv, remaining);
+	slack = __estimate_accuracy(&remaining);
+	if (slack < current->timer_slack_ns)
+		slack = current->timer_slack_ns;
+	return slack;
+}
+
+/* True when the next entry would spill past the page holding 'table'. */
+#define POLL_TABLE_FULL(table) \
+	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+
+/*
+ * Ok, Peter made a complicated, but straightforward multiple_wait() function.
+ * I have rewritten this, taking some shortcuts: This code may not be easy to
+ * follow, but it should be free of race-conditions, and it's practical. If you
+ * understand what I'm doing here, then you understand how the linux
+ * sleep/wakeup mechanism works.
+ *
+ * Two very simple procedures, poll_wait() and poll_freewait() make all the
+ * work. poll_wait() is an inline-function defined in <linux/poll.h>,
+ * as all select/poll functions have to call it to add an entry to the
+ * poll table.
+ */
+static void __pollwait(struct file *filp __attribute__((unused)), wait_queue_head_t *wait_address,
+ poll_table *p);
+
+/* Prepare a poll_wqueues for a fresh poll cycle by the current task. */
+static void scif_poll_initwait(struct poll_wqueues *pwq)
+{
+	init_poll_funcptr(&pwq->pt, __pollwait);
+	pwq->polling_task = current;
+	pwq->error = 0;
+	pwq->triggered = 0;
+	pwq->inline_index = 0;
+	pwq->table = NULL;
+}
+
+/* Detach a single poll entry from the wait queue it was registered on. */
+static void free_poll_entry(struct poll_table_entry *entry)
+{
+	remove_wait_queue(entry->wait_address, &entry->wait);
+}
+
+/*
+ * Undo scif_poll_initwait()/__pollwait(): remove every registered wait
+ * queue entry (inline entries first, then the overflow pages) and free
+ * the overflow pages themselves.
+ */
+static void scif_poll_freewait(struct poll_wqueues *pwq)
+{
+	struct poll_table_page * p = pwq->table;
+	int i;
+	for (i = 0; i < pwq->inline_index; i++)
+		free_poll_entry(pwq->inline_entries + i);
+	while (p) {
+		struct poll_table_entry *entry;
+		struct poll_table_page *old;
+
+		/* p->entry points one past the last used slot on this page. */
+		entry = p->entry;
+		do {
+			entry--;
+			free_poll_entry(entry);
+		} while (entry > p->entries);
+		old = p;
+		p = p->next;
+		free_page((unsigned long) old);
+	}
+}
+
+/*
+ * Return the next free poll_table_entry: the inline array inside
+ * poll_wqueues is used first, then page-sized overflow tables are
+ * allocated on demand. Returns NULL (and records -ENOMEM in p->error)
+ * when a new page cannot be allocated.
+ */
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
+{
+	struct poll_table_page *table = p->table;
+
+	if (p->inline_index < N_INLINE_POLL_ENTRIES)
+		return p->inline_entries + p->inline_index++;
+
+	if (!table || POLL_TABLE_FULL(table)) {
+		struct poll_table_page *new_table;
+
+		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
+		if (!new_table) {
+			p->error = -ENOMEM;
+			return NULL;
+		}
+		/* Fresh page: first free slot is the start of entries[]. */
+		new_table->entry = new_table->entries;
+		new_table->next = table;
+		p->table = new_table;
+		table = new_table;
+	}
+
+	return table->entry++;
+}
+
+/*
+ * Common wakeup path for poll entries: mark the poll_wqueues as
+ * triggered, then wake the polling task via a dummy wait queue entry.
+ */
+static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_wqueues *pwq = wait->private;
+	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+	/*
+	 * Although this function is called under waitqueue lock, LOCK
+	 * doesn't imply write barrier and the users expect write
+	 * barrier semantics on wakeup functions. The following
+	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+	 * and is paired with set_mb() in poll_schedule_timeout.
+	 */
+	smp_wmb();
+	pwq->triggered = 1;
+
+	/*
+	 * Perform the default wake up operation using a dummy
+	 * waitqueue.
+	 *
+	 * TODO: This is hacky but there currently is no interface to
+	 * pass in @sync. @sync is scheduled to be removed and once
+	 * that happens, wake_up_process() can be used directly.
+	 */
+	return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
+/*
+ * Wait queue callback: filter the wakeup against the event mask this
+ * entry registered for, then fall through to the generic wakeup.
+ */
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_table_entry *entry =
+		container_of(wait, struct poll_table_entry, wait);
+
+	/* A NULL key means "wake unconditionally". */
+	if (key == NULL || ((unsigned long)key & entry->key))
+		return __pollwake(wait, mode, sync, key);
+	return 0;
+}
+
+/* Add a new entry */
+/*
+ * poll_table callback: register the polling task on @wait_address with
+ * pollwake() as the wakeup function. If no entry can be allocated the
+ * registration is silently skipped (poll_get_entry records -ENOMEM).
+ */
+static void __pollwait(struct file *filp __attribute__((unused)), wait_queue_head_t *wait_address,
+	poll_table *p)
+{
+	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+	struct poll_table_entry *entry = poll_get_entry(pwq);
+	if (!entry)
+		return;
+	entry->filp = NULL;
+	entry->wait_address = wait_address;
+	/* The poll_table key field was renamed to _key in kernel 3.10. */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+	entry->key = p->_key;
+#else
+	entry->key = p->key;
+#endif
+	init_waitqueue_func_entry(&entry->wait, pollwake);
+	entry->wait.private = pwq;
+	add_wait_queue(wait_address, &entry->wait);
+}
+
+/*
+ * Sleep until a pollwake() trigger, a signal, or expiry of the hrtimer
+ * deadline. Returns 0 on timeout, -EINTR otherwise (including the
+ * already-triggered case).
+ */
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+	ktime_t *expires, unsigned long slack)
+{
+	int rc = -EINTR;
+
+	set_current_state(state);
+	if (!pwq->triggered)
+		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+	__set_current_state(TASK_RUNNING);
+
+	/*
+	 * Prepare for the next iteration.
+	 *
+	 * The following set_mb() serves two purposes. First, it's
+	 * the counterpart rmb of the wmb in pollwake() such that data
+	 * written before wake up is always visible after wake up.
+	 * Second, the full barrier guarantees that triggered clearing
+	 * doesn't pass event check of the next iteration. Note that
+	 * this problem doesn't exist for the first iteration as
+	 * add_wait_queue() has full barrier semantics.
+	 */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,2,0))
+	smp_store_mb(pwq->triggered, 0);
+#else
+	set_mb(pwq->triggered, 0);
+#endif
+
+	return rc;
+}
+
+/* Poll a SCIF endpoint directly from kernel context (no struct file). */
+static unsigned int scif_poll_kernel(poll_table *pwait, struct endpt *ep)
+{
+	return __scif_pollfd(NULL, pwait, ep);
+}
+
+/*
+ * Fish for pollable events on the pollfd->epd endpoint. We're only
+ * interested in events matching the pollfd->events mask, and the result
+ * matching that mask is both recorded in pollfd->revents and returned. The
+ * pwait poll_table will be used by the endpoint's poll handler for waiting,
+ * if non-NULL.
+ */
+static inline unsigned int do_pollfd(struct scif_pollepd *pollfd, poll_table *pwait)
+{
+	unsigned int mask;
+	scif_epd_t epd;
+
+	mask = 0;
+	epd = pollfd->epd;
+	if (epd) {
+		/*
+		 * Fix: the old code assigned POLLNVAL and DEFAULT_POLLMASK
+		 * here, but both stores were dead — mask is unconditionally
+		 * overwritten by scif_poll_kernel() below.
+		 *
+		 * Tell the endpoint which events we care about; POLLERR and
+		 * POLLHUP are always reported.
+		 */
+		if (pwait)
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
+			pwait->_key = pollfd->events | POLLERR | POLLHUP;
+#else
+			pwait->key = pollfd->events | POLLERR | POLLHUP;
+#endif
+		mask = scif_poll_kernel(pwait, epd);
+		/* Mask out unneeded events. */
+		mask &= pollfd->events | POLLERR | POLLHUP;
+	}
+	pollfd->revents = mask;
+
+	return mask;
+}
+
+/*
+ * Core poll loop over @nfds SCIF endpoints: register waiters on the
+ * first pass, then sleep/re-check until an event, a signal, an error,
+ * or the timeout. Returns the number of ready endpoints, 0 on timeout,
+ * or a negative error (-EINTR on signal, or wait->error).
+ */
+static int do_poll(unsigned int nfds, struct scif_pollepd *ufds,
+	struct poll_wqueues *wait, struct timespec *end_time)
+{
+	poll_table* pt = &wait->pt;
+	ktime_t expire, *to = NULL;
+	int timed_out = 0, count = 0, i = 0;
+	unsigned long slack = 0;
+
+	/* Optimise the no-wait case */
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
+		pt = NULL;
+		timed_out = 1;
+	}
+
+	if (end_time && !timed_out)
+		slack = estimate_accuracy(end_time);
+
+	for (;;) {
+		for (i = 0; i < nfds; i++) {
+			/*
+			 * Fish for events. If we found one, record it
+			 * and kill the poll_table, so we don't
+			 * needlessly register any other waiters after
+			 * this. They'll get immediately deregistered
+			 * when we break out and return.
+			 */
+			if (do_pollfd(ufds + i, pt)) {
+				count++;
+				pt = NULL;
+			}
+		}
+		/*
+		 * All waiters have already been registered, so don't provide
+		 * a poll_table to them on the next loop iteration.
+		 */
+		pt = NULL;
+		if (!count) {
+			count = wait->error;
+			if (signal_pending(current))
+				count = -EINTR;
+		}
+		if (count || timed_out)
+			break;
+
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
+		}
+
+		/* Sleep; a zero return means the hrtimer deadline fired. */
+		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
+			timed_out = 1;
+	}
+	return count;
+}
+
+/* Set up the wait table, run the poll loop, then tear the table down. */
+static int do_scif_poll(struct scif_pollepd *ufds, unsigned int nfds,
+	struct timespec *end_time)
+{
+	struct poll_wqueues table;
+	int epdcount;
+
+	scif_poll_initwait(&table);
+	epdcount = do_poll(nfds, ufds, &table, end_time);
+	scif_poll_freewait(&table);
+
+	return epdcount;
+}
+
+/*
+ * Add two timespec values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0). On overflow the
+ * seconds field saturates at TIME_T_MAX.
+ */
+static struct timespec scif_timespec_add_safe(const struct timespec lhs,
+				const struct timespec rhs)
+{
+	struct timespec sum;
+
+	set_normalized_timespec(&sum, lhs.tv_sec + rhs.tv_sec,
+				lhs.tv_nsec + rhs.tv_nsec);
+
+	/* A wrapped sum compares smaller than one of its operands. */
+	if (sum.tv_sec < lhs.tv_sec || sum.tv_sec < rhs.tv_sec)
+		sum.tv_sec = TIME_T_MAX;
+
+	return sum;
+}
+/**
+ * poll_select_set_timeout - helper function to setup the timeout value
+ * @to: pointer to timespec variable for the final timeout
+ * @sec: seconds (from user space)
+ * @nsec: nanoseconds (from user space)
+ *
+ * Note, we do not use a timespec for the user space value here, That
+ * way we can use the function for timeval and compat interfaces as well.
+ *
+ * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
+ */
+static int scif_poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+{
+	struct timespec requested = {.tv_sec = sec, .tv_nsec = nsec};
+
+	if (!timespec_valid(&requested))
+		return -EINVAL;
+
+	if (sec || nsec) {
+		/* Convert the relative timeout to an absolute expiry time. */
+		ktime_get_ts(to);
+		*to = scif_timespec_add_safe(*to, requested);
+	} else {
+		/* Zero timeout: poll once and return immediately. */
+		to->tv_sec = 0;
+		to->tv_nsec = 0;
+	}
+	return 0;
+}
+
+/*
+ * scif_poll - poll a set of SCIF endpoints.
+ * @ufds: array of endpoint/event descriptors
+ * @nfds: number of entries in @ufds
+ * @timeout_msecs: timeout in milliseconds; negative means wait forever
+ */
+int scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs)
+{
+	struct timespec deadline;
+	struct timespec *to = NULL;
+
+	/* A negative timeout means no deadline at all. */
+	if (timeout_msecs >= 0) {
+		to = &deadline;
+		scif_poll_select_set_timeout(to,
+			timeout_msecs / MSEC_PER_SEC,
+			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
+	}
+
+	return do_scif_poll(ufds, nfds, to);
+}
+EXPORT_SYMBOL(scif_poll);
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <mic/micscif.h>
+#include <mic/micscif_smpt.h>
+#if defined(HOST) || defined(WINDOWS)
+#include "mic_common.h"
+#endif
+
+struct _mic_ctx_t;
+// Figure out which SMPT entry based on the host addr
+#define SYSTEM_ADDR_TO_SMPT(sysaddr)	((sysaddr) >> (MIC_SYSTEM_PAGE_SHIFT))
+#define HOSTMIC_PA_TO_SMPT(hostmic_pa)	(((hostmic_pa) - MIC_SYSTEM_BASE)\
+	>> MIC_SYSTEM_PAGE_SHIFT)
+
+/* Number of SMPT windows managed by this driver and the MIC-side
+ * address range they cover, starting at MIC_SYSTEM_BASE. */
+#define NUM_SMPT_ENTRIES_IN_USE	32
+#define SMPT_TO_MIC_PA(smpt_index)	(MIC_SYSTEM_BASE + ((smpt_index) * \
+	MIC_SYSTEM_PAGE_SIZE))
+#define MAX_HOST_MEMORY	((NUM_SMPT_ENTRIES_IN_USE) * MIC_SYSTEM_PAGE_SIZE)
+#define MAX_SYSTEM_ADDR	((MIC_SYSTEM_BASE) + (MAX_HOST_MEMORY) - (1))
+#define IS_MIC_SYSTEM_ADDR(addr)	(((addr) >= MIC_SYSTEM_BASE) && \
+	((addr) <= MAX_SYSTEM_ADDR))
+
+/* Rounding helpers. ALIGN() rounds up, so the *_LOW variants subtract
+ * (granule - 1) first to achieve a round-down. */
+#define _PAGE_OFFSET(x)	((x) & ((PAGE_SIZE) - (1ULL)))
+#define SMPT_OFFSET(x)	((x) & MIC_SYSTEM_PAGE_MASK)
+#define PAGE_ALIGN_LOW(x)	ALIGN(((x) - ((PAGE_SIZE) - 1ULL)), (PAGE_SIZE))
+#define PAGE_ALIGN_HIGH(x)	ALIGN((x), (PAGE_SIZE))
+#define SMPT_ALIGN_LOW(x)	ALIGN(((x) - (MIC_SYSTEM_PAGE_MASK)), \
+	(MIC_SYSTEM_PAGE_SIZE))
+#define SMPT_ALIGN_HIGH(x)	ALIGN((x), (MIC_SYSTEM_PAGE_SIZE))
+
+#if defined(HOST)
+#define SMPT_LOGGING	0
+#if SMPT_LOGGING
+/* Map/unmap accounting counters, compiled in only when SMPT_LOGGING. */
+static int64_t smpt_ref_count_g[MAX_BOARD_SUPPORTED];
+static int64_t map_count_g;
+static int64_t unmap_count_g;
+#endif
+#endif
+
+/* Program SMPT register @index to map @dma_addr with snooping enabled. */
+void mic_smpt_set(volatile void *mm_sbox, uint64_t dma_addr, uint64_t index)
+{
+	uint32_t smpt_reg_val = BUILD_SMPT(SNOOP_ON, dma_addr >> MIC_SYSTEM_PAGE_SHIFT);
+	/* SMPT registers are 32 bits wide, hence the 4-byte stride. */
+	writel(smpt_reg_val, (uint8_t*)mm_sbox + SBOX_SMPT00 + (4 * index));
+}
+
+#if defined(HOST)
+/*
+ * Called once per board as part of starting a MIC
+ * to restore the SMPT state to the previous values
+ * as stored in SMPT SW data structures.
+ */
+/*
+ * Called once per board as part of starting a MIC
+ * to restore the SMPT state to the previous values
+ * as stored in SMPT SW data structures.
+ */
+void mic_smpt_restore(mic_ctx_t *mic_ctx)
+{
+	int i;
+	dma_addr_t dma_addr;
+	uint32_t *smpt = (uint32_t*)(mic_ctx->mmio.va +
+		HOST_SBOX_BASE_ADDRESS + SBOX_SMPT00);
+	uint32_t smpt_reg_val;
+
+	for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+		dma_addr = mic_ctx->mic_smpt[i].dma_addr;
+		/* Registers are written directly only for KNC; KNF SMPT
+		 * registers are not host accessible (see smpt_op comments). */
+		if (mic_ctx->bi_family == FAMILY_KNC) {
+			smpt_reg_val = BUILD_SMPT(SNOOP_ON,
+				dma_addr >> MIC_SYSTEM_PAGE_SHIFT);
+			writel(smpt_reg_val, &smpt[i]);
+		}
+	}
+}
+
+/*
+ * Called once per board as part of smpt init
+ * This does a 0-512G smpt mapping,
+ */
+/*
+ * Called once per board as part of smpt init
+ * This does a 0-512G smpt mapping,
+ */
+void mic_smpt_init(mic_ctx_t *mic_ctx)
+{
+	int i;
+	dma_addr_t dma_addr;
+	uint32_t *smpt = (uint32_t*)(mic_ctx->mmio.va +
+		HOST_SBOX_BASE_ADDRESS + SBOX_SMPT00);
+	uint32_t smpt_reg_val;
+#if SMPT_LOGGING
+	smpt_ref_count_g[mic_ctx->bi_id] = 0;
+#endif
+
+	spin_lock_init(&mic_ctx->smpt_lock);
+	mic_ctx->mic_smpt = kmalloc(sizeof(mic_smpt_t)
+		* NUM_SMPT_ENTRIES_IN_USE, GFP_KERNEL);
+	/* Fix: the allocation was previously dereferenced unchecked,
+	 * which would oops on OOM. Fail loudly and bail out instead. */
+	if (!mic_ctx->mic_smpt) {
+		printk(KERN_ERR "%s failed to allocate SMPT table for board %d\n",
+			__func__, mic_ctx->bi_id);
+		return;
+	}
+
+	for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+		/* Identity map: entry i covers host DMA address
+		 * i * MIC_SYSTEM_PAGE_SIZE with no references yet. */
+		dma_addr = i * MIC_SYSTEM_PAGE_SIZE;
+		mic_ctx->mic_smpt[i].dma_addr = dma_addr;
+		mic_ctx->mic_smpt[i].ref_count = 0;
+		/* Only KNC SMPT registers are host accessible. */
+		if (mic_ctx->bi_family == FAMILY_KNC) {
+			smpt_reg_val = BUILD_SMPT(SNOOP_ON,
+				dma_addr >> MIC_SYSTEM_PAGE_SHIFT);
+			writel(smpt_reg_val, &smpt[i]);
+		}
+	}
+}
+
+/*
+ * Called during mic exit per ctx (i.e once for every board)
+ * If ref count is non-zero, then it means that some module
+ * did not call mic_unmap_single/mic_ctx_unmap_single correctly.
+ */
+/*
+ * Called during mic exit per ctx (i.e once for every board)
+ * If ref count is non-zero, then it means that some module
+ * did not call mic_unmap_single/mic_ctx_unmap_single correctly.
+ */
+void
+mic_smpt_uninit(mic_ctx_t *mic_ctx)
+{
+#if SMPT_LOGGING
+	int i;	/* fix: 'i' was used below without any declaration */
+
+	printk("global ref count for node = %d is %lld\n",
+		mic_ctx->bi_id+1, smpt_ref_count_g[mic_ctx->bi_id]);
+	printk("mic map calls = %lld, mic unmap calls = %lld \n",
+		map_count_g, unmap_count_g);
+
+	for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+		printk("[smpt_san%d] smpt_entry[%d] dma_addr = 0x%llX"
+			" ref_count = %lld \n", mic_ctx->bi_id+1, i,
+			mic_ctx->mic_smpt[i].dma_addr,
+			mic_ctx->mic_smpt[i].ref_count);
+	}
+#endif
+#ifdef DEBUG
+	{
+		/* A leaked reference indicates a missing unmap call. */
+		int i;
+		for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++)
+			WARN_ON(mic_ctx->mic_smpt[i].ref_count);
+	}
+#endif
+
+	kfree(mic_ctx->mic_smpt);
+	/* NULL the pointer so a use-after-free shows up as a clean oops.
+	 * (Also dropped a stray ';' statement that was here.) */
+	mic_ctx->mic_smpt = NULL;
+}
+
+/* DMA-map @p for @mic_ctx's board; convenience wrapper over mic_map_single. */
+dma_addr_t mic_ctx_map_single(mic_ctx_t *mic_ctx, void *p, size_t size)
+{
+	return mic_map_single(mic_ctx->bi_id, mic_ctx->bi_pdev, p, size);
+}
+
+/*
+ * Undo mic_map_single(): translate the MIC-visible address back to the
+ * bus DMA address, drop the SMPT reference, then release the PCI mapping.
+ * NOTE(review): mic_to_dma_addr() presumably must run before mic_unmap()
+ * releases the SMPT entry — keep this ordering.
+ */
+void mic_unmap_single(int bid, struct pci_dev *hwdev, dma_addr_t mic_addr,
+	size_t size)
+{
+	dma_addr_t dma_addr = mic_to_dma_addr(bid, mic_addr);
+	mic_unmap(bid, mic_addr, size);
+	pci_unmap_single(hwdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
+}
+
+/* Unmap @dma_addr using the board id and PCI device from @mic_ctx. */
+void mic_ctx_unmap_single(mic_ctx_t *mic_ctx, dma_addr_t dma_addr,
+	size_t size)
+{
+	mic_unmap_single(mic_ctx->bi_id, mic_ctx->bi_pdev, dma_addr, size);
+}
+
+/*
+ * DMA-map @p of @size bytes for board @bid and reserve an SMPT window
+ * for it. Returns the MIC-visible address, or 0 on failure (the PCI
+ * mapping is rolled back if the SMPT reservation fails).
+ */
+dma_addr_t mic_map_single(int bid, struct pci_dev *hwdev, void *p,
+	size_t size)
+{
+	dma_addr_t mic_addr = 0;
+	dma_addr_t dma_addr;
+
+	dma_addr = pci_map_single(hwdev, p, size, PCI_DMA_BIDIRECTIONAL);
+
+	if (!pci_dma_mapping_error(hwdev, dma_addr)) {
+		if (!(mic_addr = mic_map(bid, dma_addr, size))) {
+			/* Fix: the previous message used a backslash line
+			 * continuation inside the string literal, embedding
+			 * raw indentation whitespace in the printed text. */
+			printk(KERN_ERR
+				"mic_map failed board id %d addr %#016llx size %#016zx\n",
+				bid, dma_addr, size);
+			pci_unmap_single(hwdev, dma_addr,
+				size, PCI_DMA_BIDIRECTIONAL);
+		}
+	}
+	return mic_addr;
+}
+
+/*
+ * Program (or re-reference) the @entries consecutive SMPT windows
+ * starting at index @spt so they map the host range beginning at
+ * @dma_addr, adding the per-entry counts in @ref[] to each window's
+ * ref_count. Caller holds mic_ctx->smpt_lock (see smpt_op()).
+ */
+void add_smpt_entry(int spt, int64_t *ref, uint64_t dma_addr, int entries, mic_ctx_t *mic_ctx)
+{
+
+	struct nodemsg msg;
+	dma_addr_t addr = dma_addr;
+	mic_smpt_t *mic_smpt = mic_ctx->mic_smpt;
+	int dev_id = mic_ctx->bi_id + 1;
+	void *mm_sbox = mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS;
+	int i;
+
+	for (i = spt; i < spt + entries; i++, addr += MIC_SYSTEM_PAGE_SIZE) {
+#ifdef CONFIG_ML1OM
+		/*
+		 * For KNF if the ref count is 0 and the entry number is greater
+		 * than 16 then we must resend a SMPT_SET message in case the uOS
+		 * was rebooted and lost SMPT register state (example during host
+		 * suspend/hibernate.
+		 */
+		if (!mic_smpt[i].ref_count && i >= (NUM_SMPT_ENTRIES_IN_USE >> 1)) {
+#else
+		if (!mic_smpt[i].ref_count && (mic_smpt[i].dma_addr != addr)) {
+#endif
+			/*
+			 * ref count was zero and dma_addr requested did not
+			 * match the dma address in the table. So, this is a
+			 * new entry in the table.
+			 * KNF: Send a message to the card
+			 * to update its smpt table with a new value.
+			 * KNC: write to the SMPT registers from host since
+			 * they are accessible.
+			 */
+			if (mic_ctx->bi_family == FAMILY_ABR) {
+				msg.uop = SMPT_SET;
+				msg.payload[0] = addr;
+				msg.payload[1] = i;
+				msg.dst.node = scif_dev[dev_id].sd_node;
+				msg.src.node = 0;
+#if SMPT_LOGGING
+				printk("[smpt_node%d] ==> sending msg to "
+					" node = %d dma_addr = 0x%llX, entry ="
+					"0x%llX\n" , mic_ctx->bi_id + 1,
+					scif_dev[dev_id].sd_node,
+					msg.payload[0], msg.payload[1]);
+#endif
+				/* Hold a node reference across the queue-pair send. */
+				micscif_inc_node_refcnt(&scif_dev[dev_id], 1);
+				micscif_nodeqp_send(&scif_dev[dev_id], &msg, NULL);
+				micscif_dec_node_refcnt(&scif_dev[dev_id], 1);
+			}
+			else
+				mic_smpt_set(mm_sbox, addr, i);
+			mic_smpt[i].dma_addr = addr;
+		}
+		/* Accumulate the caller-supplied per-window reference counts. */
+		mic_smpt[i].ref_count += ref[i - spt];
+	}
+}
+
+/*
+ * smpt_op - locate (or create) a run of 'entries' consecutive SMPT slots
+ * that map 'dma_addr' onwards, then add the reference counts in 'ref'
+ * via add_smpt_entry().
+ *
+ * Returns the MIC-side physical address of the first slot, or 0 when no
+ * suitable run exists or the power-management reference cannot be taken.
+ */
+dma_addr_t smpt_op(int bid, uint64_t dma_addr,
+ int entries, int64_t *ref)
+{
+ int spt = -1; /* smpt index */
+ int ee = 0; /* existing entries */
+ int fe = 0; /* free entries */
+ int i;
+ unsigned long flags;
+ dma_addr_t mic_addr = 0;
+ dma_addr_t addr = dma_addr;
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(bid);
+ mic_smpt_t *mic_smpt = mic_ctx->mic_smpt;
+
+ /* Keep the card out of low-power states while we touch the SMPT. */
+ if (micpm_get_reference(mic_ctx, true))
+ goto exit;
+ spin_lock_irqsave(&mic_ctx->smpt_lock, flags);
+
+ /* find existing entries */
+ for (i = 0; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+ if (mic_smpt[i].dma_addr == addr) {
+ ee++;
+ addr += MIC_SYSTEM_PAGE_SIZE;
+ }
+ else if (ee) /* cannot find contiguous entries */
+ goto not_found;
+
+ if (ee == entries)
+ goto found;
+ }
+
+ /* find free entry */
+#ifdef CONFIG_ML1OM
+ /*
+ * For KNF the SMPT registers are not host accessible so we maintain a
+ * 1:1 map for SMPT registers from 0-256GB i.e. the first 16 entries and
+ * look for SMPT entries for P2P and IB etc from the 16th entry onwards.
+ * This allows the KNF card to boot on Host systems with < 256GB system
+ * memory and access VNET/SCIF buffers without crashing. P2P and IB SMPT
+ * entries are setup after SCIF driver load/reload via SCIF Node QP
+ * SMPT_SET messages.
+ */
+ for (i = NUM_SMPT_ENTRIES_IN_USE / 2 ; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+#else
+ for (i = 0 ; i < NUM_SMPT_ENTRIES_IN_USE; i++) {
+#endif
+ /* Count consecutive zero-ref slots; any used slot resets the run. */
+ fe = (mic_smpt[i].ref_count == 0) ? fe + 1: 0;
+ if (fe == entries)
+ goto found;
+ }
+
+not_found:
+ spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags);
+ micpm_put_reference(mic_ctx);
+exit:
+ /* mic_addr is still 0 here, signalling failure to the caller. */
+ return mic_addr;
+found:
+ /* i is the last slot of the run; back up to its first slot. */
+ spt = i - entries + 1;
+ mic_addr = SMPT_TO_MIC_PA(spt);
+ add_smpt_entry(spt, ref, dma_addr, entries, mic_ctx);
+ spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags);
+ micpm_put_reference(mic_ctx);
+ return mic_addr;
+}
+
+
+/*
+ * get_smpt_ref_count - determine how many SMPT pages the range
+ * [dma_addr, dma_addr + size) touches and how many bytes of the range
+ * fall into each page.
+ *
+ * @ref:        out: per-page byte counts (one slot per SMPT page touched)
+ * @dma_addr:   start of the host DMA range
+ * @size:       length of the range in bytes
+ * @smpt_start: optional out: dma_addr rounded down to an SMPT page boundary
+ *
+ * Returns the number of SMPT pages spanned.
+ */
+int get_smpt_ref_count(int64_t *ref, dma_addr_t dma_addr, size_t size,
+		uint64_t *smpt_start)
+{
+	uint64_t cur = dma_addr;
+	uint64_t end = dma_addr + size;
+	int count = 0;
+
+	while (cur < end) {
+		/* Next SMPT page boundary strictly above cur. */
+		uint64_t next = SMPT_ALIGN_HIGH(cur + 1);
+
+		ref[count++] = min(next, end) - cur;
+		cur = next;
+	}
+
+	if (smpt_start)
+		*smpt_start = SMPT_ALIGN_LOW(dma_addr);
+
+	return count;
+}
+
+/*
+ * mic_map - reserve SMPT entries so board 'bid' can reach the host DMA
+ * range [dma_addr, dma_addr + size).
+ *
+ * Returns the MIC-visible address of dma_addr, or 0 on failure
+ * (mic_addr 0 is never a valid mapping).
+ */
+dma_addr_t mic_map(int bid, dma_addr_t dma_addr, size_t size)
+{
+	int64_t ref[NUM_SMPT_ENTRIES_IN_USE];
+	uint64_t smpt_start;
+	int num_entries;
+	dma_addr_t mic_addr;
+#if SMPT_LOGGING
+	unsigned long flags;
+	mic_ctx_t *mic_ctx = get_per_dev_ctx(bid);
+	spin_lock_irqsave(&mic_ctx->smpt_lock, flags);
+	map_count_g++;
+	smpt_ref_count_g[bid] += (int64_t)size;
+	spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags);
+#endif
+	if (!size)
+		return 0;
+
+	/*
+	 * Compute how many SMPT pages the range spans, the per-page byte
+	 * counts, and the page-aligned address to search from.
+	 */
+	num_entries = get_smpt_ref_count(ref, dma_addr, size, &smpt_start);
+
+	/* Reserve (or extend) the SMPT window; 16G-aligned MIC address. */
+	mic_addr = smpt_op(bid, smpt_start, num_entries, ref);
+
+	/* Zero means smpt_op() could not set up a window. */
+	if (!mic_addr) {
+		WARN_ON(1);
+		return 0;
+	}
+
+	/* Re-apply the offset of dma_addr inside its SMPT page. */
+	return mic_addr + (dma_addr & MIC_SYSTEM_PAGE_MASK);
+}
+
+/*
+ * Unmaps mic_addr to mic_addr + size memory in the smpt table
+ * of board bid by dropping the per-entry reference counts taken by the
+ * matching mic_map() call. Entries are not torn down here; a zero
+ * ref_count merely makes the slot reclaimable by smpt_op().
+ */
+void mic_unmap(int bid, dma_addr_t mic_addr, size_t size)
+{
+ mic_ctx_t *mic_ctx = get_per_dev_ctx(bid);
+ mic_smpt_t *mic_smpt = mic_ctx->mic_smpt;
+ int64_t ref[NUM_SMPT_ENTRIES_IN_USE];
+ int num_smpt;
+ int spt = HOSTMIC_PA_TO_SMPT(mic_addr);
+ int i;
+ unsigned long flags;
+
+ if (!size)
+ return;
+
+ /* Reject addresses outside the MIC system-address aperture. */
+ if (!IS_MIC_SYSTEM_ADDR(mic_addr)) {
+ WARN_ON(1);
+ return;
+ }
+
+ /* Get number of smpt entries to be mapped, ref count array */
+ num_smpt = get_smpt_ref_count(ref, mic_addr, size, NULL);
+
+ spin_lock_irqsave(&mic_ctx->smpt_lock, flags);
+
+#if SMPT_LOGGING
+ unmap_count_g++;
+ smpt_ref_count_g[bid] -= (int64_t)size;
+#endif
+
+ /* Release exactly the per-slot byte counts taken at map time. */
+ for (i = spt; i < spt + num_smpt; i++) {
+ mic_smpt[i].ref_count -= ref[i - spt];
+ /* Going negative means an unmap without a matching map. */
+ WARN_ON(mic_smpt[i].ref_count < 0);
+ }
+ spin_unlock_irqrestore(&mic_ctx->smpt_lock, flags);
+}
+
+/*
+ * mic_to_dma_addr - translate a MIC-visible system address back to the
+ * host DMA address recorded in board 'bid's SMPT table.
+ *
+ * Returns 0 (and warns) if mic_addr is not a MIC system address.
+ */
+dma_addr_t mic_to_dma_addr(int bid, dma_addr_t mic_addr)
+{
+	mic_ctx_t *mic_ctx = get_per_dev_ctx(bid);
+	int slot;
+
+	if (!IS_MIC_SYSTEM_ADDR(mic_addr)) {
+		WARN_ON(1);
+		return 0;
+	}
+
+	slot = HOSTMIC_PA_TO_SMPT(mic_addr);
+	return mic_ctx->mic_smpt[slot].dma_addr + SMPT_OFFSET(mic_addr);
+}
+
+#endif
+
+/* is_syspa - true when 'pa' lies inside the MIC system-address aperture. */
+bool is_syspa(dma_addr_t pa)
+{
+	return IS_MIC_SYSTEM_ADDR(pa) ? true : false;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <mic/micscif.h>
+
+unsigned long scif_get_maxid(void);
+/* sysfs read handler for "maxnode": highest SCIF node id currently known. */
+static ssize_t show_scif_maxid(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_maxid);
+}
+static DEVICE_ATTR(maxnode, S_IRUGO, show_scif_maxid, NULL);
+
+unsigned long scif_get_total(void);
+/* sysfs read handler for "total": total number of SCIF nodes. */
+static ssize_t show_scif_total(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_total);
+}
+static DEVICE_ATTR(total, S_IRUGO, show_scif_total, NULL);
+
+unsigned long scif_get_nodes(void);
+/*
+ * sysfs read handler for "nodes": emits "<total>:<self>[,<node>...]\n",
+ * listing every node that is running, sleeping, or is the local node.
+ */
+static ssize_t show_scif_nodes(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int len = 0;
+	int node;
+
+	/*
+	 * Pass the *remaining* space to each call and use scnprintf() so
+	 * 'len' can never exceed PAGE_SIZE. The old code passed PAGE_SIZE
+	 * every time, which let writes at 'buf + len' run past the page.
+	 */
+	len += scnprintf(buf + len, PAGE_SIZE - len, "%d:", ms_info.mi_total);
+	len += scnprintf(buf + len, PAGE_SIZE - len, "%d", ms_info.mi_nodeid);
+
+	for (node = 0; node <= ms_info.mi_maxid; node++) {
+		if (scif_dev[node].sd_state == SCIFDEV_RUNNING ||
+			scif_dev[node].sd_state == SCIFDEV_SLEEPING ||
+			is_self_scifdev(&scif_dev[node])) {
+			len += scnprintf(buf + len, PAGE_SIZE - len, ",%d",
+					scif_dev[node].sd_node);
+		}
+	}
+
+	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+	return len;
+}
+static DEVICE_ATTR(nodes, S_IRUGO, show_scif_nodes, NULL);
+
+/* sysfs read handler for "watchdog_to": current timeout in seconds. */
+static ssize_t show_watchdog_to(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_watchdog_to);
+}
+
+/*
+ * sysfs write handler for "watchdog_to": set the watchdog timeout in
+ * seconds. Only positive decimal values are accepted.
+ */
+static ssize_t store_watchdog_to(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf,
+		size_t count)
+{
+	int i;
+
+	if (sscanf(buf, "%d", &i) != 1 || i <= 0) {
+		printk(KERN_ERR "Attempt to set invalid watchdog timeout\n");
+		return -EINVAL;
+	}
+
+	ms_info.mi_watchdog_to = i;
+	printk("Current watchdog timeout %d seconds\n", ms_info.mi_watchdog_to);
+	/*
+	 * Report the whole write as consumed. The old code returned
+	 * strlen(buf), which can be short of 'count' and cause userspace
+	 * to re-issue the remainder of the write.
+	 */
+	return count;
+}
+static DEVICE_ATTR(watchdog_to, S_IRUGO | S_IWUSR, show_watchdog_to, store_watchdog_to);
+
+/* sysfs read handler for "watchdog_enabled": 1 if the watchdog is on. */
+static ssize_t show_watchdog_enabled(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_watchdog_enabled);
+}
+
+/*
+ * sysfs write handler for "watchdog_enabled": a non-zero value enables
+ * the watchdog (on the host this re-queues the per-node watchdog work),
+ * zero disables it. Negative values are rejected.
+ */
+static ssize_t store_watchdog_enabled(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf,
+		size_t count)
+{
+	int i;
+#ifndef _MIC_SCIF_
+	struct micscif_dev *scifdev;
+	int node;
+#endif
+
+	if (sscanf(buf, "%d", &i) != 1 || i < 0)
+		return -EINVAL;
+
+	if (i && !ms_info.mi_watchdog_enabled) {
+		ms_info.mi_watchdog_enabled = 1;
+#ifndef _MIC_SCIF_
+		/* Restart the node-alive poll for every known remote node. */
+		for (node = 1; node <= ms_info.mi_maxid; node++) {
+			scifdev = &scif_dev[node];
+			if (scifdev->sd_ln_wq)
+				queue_delayed_work(scifdev->sd_ln_wq,
+					&scifdev->sd_watchdog_work, NODE_ALIVE_TIMEOUT);
+		}
+#endif
+	}
+
+	if (!i)
+		ms_info.mi_watchdog_enabled = 0;
+
+	printk("Watchdog timeout enabled = %d\n", ms_info.mi_watchdog_enabled);
+	/*
+	 * Report the whole write as consumed; the old strlen(buf) return
+	 * could be short of 'count' and make userspace retry the tail.
+	 */
+	return count;
+}
+static DEVICE_ATTR(watchdog_enabled, S_IRUGO | S_IWUSR, show_watchdog_enabled, store_watchdog_enabled);
+
+/* sysfs read handler for "watchdog_auto_reboot": 1 if auto reboot is on. */
+static ssize_t show_watchdog_auto_reboot(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ms_info.mi_watchdog_auto_reboot);
+}
+
+/*
+ * sysfs write handler for "watchdog_auto_reboot": a non-zero value sets
+ * the mi_watchdog_auto_reboot flag, zero clears it. Negative values are
+ * rejected.
+ */
+static ssize_t store_watchdog_auto_reboot(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf,
+		size_t count)
+{
+	int i;
+
+	if (sscanf(buf, "%d", &i) != 1 || i < 0)
+		return -EINVAL;
+
+	/* Any positive value enables, zero disables. */
+	ms_info.mi_watchdog_auto_reboot = i ? 1 : 0;
+
+	printk("Watchdog auto reboot enabled = %d\n", ms_info.mi_watchdog_auto_reboot);
+	/*
+	 * Report the whole write as consumed; the old strlen(buf) return
+	 * could be short of 'count' and make userspace retry the tail.
+	 */
+	return count;
+}
+static DEVICE_ATTR(watchdog_auto_reboot, S_IRUGO | S_IWUSR, show_watchdog_auto_reboot, store_watchdog_auto_reboot);
+
+/* sysfs read handler for "proxy_dma_threshold": threshold in bytes. */
+static ssize_t show_proxy_dma_threshold(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lld\n", ms_info.mi_proxy_dma_threshold);
+}
+
+/*
+ * sysfs write handler for "proxy_dma_threshold": set the P2P proxy DMA
+ * threshold in bytes.
+ */
+static ssize_t store_proxy_dma_threshold(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf,
+		size_t count)
+{
+	/*
+	 * Scan into a signed 64-bit object to match "%lld" (and the type
+	 * shown by show_proxy_dma_threshold); the old code scanned %lld
+	 * into a uint64_t, a signed/unsigned format mismatch.
+	 */
+	int64_t i;
+
+	if (sscanf(buf, "%lld", &i) != 1)
+		return -EINVAL;
+
+	ms_info.mi_proxy_dma_threshold = i;
+	printk("P2P proxy DMA Threshold = %lld bytes\n", ms_info.mi_proxy_dma_threshold);
+	/*
+	 * Report the whole write as consumed; the old strlen(buf) return
+	 * could be short of 'count' and make userspace retry the tail.
+	 */
+	return count;
+}
+static DEVICE_ATTR(proxy_dma_threshold, S_IRUGO | S_IWUSR, show_proxy_dma_threshold, store_proxy_dma_threshold);
+
+/* All SCIF sysfs attributes defined above; NULL-terminated for sysfs. */
+static struct attribute *scif_attributes[] = {
+ &dev_attr_maxnode.attr,
+ &dev_attr_total.attr,
+ &dev_attr_nodes.attr,
+ &dev_attr_watchdog_to.attr,
+ &dev_attr_watchdog_enabled.attr,
+ &dev_attr_watchdog_auto_reboot.attr,
+ &dev_attr_proxy_dma_threshold.attr,
+ NULL
+};
+
+/* Attribute group registered against the SCIF device by the driver. */
+struct attribute_group scif_attr_group = {
+ .attrs = scif_attributes
+};
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/* ************************************************************************* *\
+generate a virtual address for a given size
+\* ************************************************************************* */
+#include "mic/micscif.h"
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_init_internal
+
+DESCRIPTION: Seed the generator with a single hole covering [0, range)
+and an empty claims list.
+\* ************************************************************************* */
+static int
+va_gen_init_internal(struct va_gen_addr *addr, uint64_t range)
+{
+	int err;
+
+	va_node_init(&addr->allocator);
+	err = va_node_alloc(&addr->allocator, &addr->hole_list);
+	if (err < 0)
+		return err;
+
+	if (va_node_is_valid(addr->hole_list)) {
+		struct va_node *hole = va_node_get(&addr->allocator, addr->hole_list);
+
+		hole->next = invalid_va_node_index;
+		hole->base = 0;
+		hole->range = range;
+	}
+	/* Nothing claimed yet. */
+	addr->claims_list = invalid_va_node_index;
+	return err;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_alloc_internal
+Allocate virtual memory by searching through free virtual memory
+linked list for first range >= desired range (first-fit).
+
+Note: Free list is sorted by base, we are searching for range.
+
+Return: Offset to allocated virtual address if successful (in pages).
+INVALID_VA_PAGE_INDEX if failed
+\* ************************************************************************* */
+static uint64_t
+va_gen_alloc_internal(struct va_gen_addr *addr, uint64_t range)
+{
+ //==========================================================================
+ // Search for a sufficiently large memory hole (first-fit).
+ //--------------------------------------------------------------------------
+
+ // Search for first available hole of sufficient size.
+ uint32_t index = addr->hole_list;
+ struct va_node *pFind;
+ // Used to handle case of an exact range match.
+ struct va_node *pPrev = 0;
+ uint64_t base;
+
+ // Zero-length requests and an empty hole list both fail outright.
+ if (0 == range || !va_node_is_valid(addr->hole_list))
+ return INVALID_VA_PAGE_INDEX;
+
+ pFind = va_node_get(&addr->allocator, index);
+
+ for ( ; ; ) {
+ if (pFind->range >= range)
+ break;
+ else {
+ index = pFind->next;
+ // No hole sufficiently large.
+ if (!va_node_is_valid(index))
+ return INVALID_VA_PAGE_INDEX;
+ pPrev = pFind;
+ pFind = va_node_get(&addr->allocator, index);
+ }
+ }
+
+ // Found an adequate hole. Get its base.
+ base = pFind->base;
+
+ //============================================================================
+ // Uncommon case: pFind->range == in_range
+ // Remove node from the hole list when exact fit. Note, could leave the
+ // hole list empty.
+ //----------------------------------------------------------------------------
+
+ if (pFind->range == range) {
+ // first node?
+ if (addr->hole_list == index)
+ addr->hole_list = pFind->next;
+ else {
+ // pPrev was set before pFind advanced, so it must exist here.
+ BUG_ON(!pPrev);
+ pPrev->next = pFind->next;
+ }
+ va_node_free(&addr->allocator, index);
+ return base;
+ }
+
+ //================================================================================
+ // Shrink an existing node that is too large.
+ //--------------------------------------------------------------------------------
+
+ else {
+ // Carve the allocation from the front of the hole.
+ pFind->base += range;
+ pFind->range -= range;
+ }
+
+ return base;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_free_claim
+
+DESCRIPTION:
+Removes [base, base + range) from the claims list. The freed region may
+sit at the front, back, or middle of a recorded claim; a middle removal
+splits the claim into two nodes.
+\* ************************************************************************* */
+static void
+va_gen_free_claim(struct va_gen_addr *addr, uint64_t base, uint64_t range)
+{
+	struct va_node *pNode = 0;
+	struct va_node *pPrev = 0;
+	uint32_t index, new_index;
+	struct va_node *pNewNode;
+	int err;
+
+	if (0 == range)
+		return;
+
+	/* Claims list is sorted by base; walk until we pass the range. */
+	for (index = addr->claims_list; va_node_is_valid(index); index = pNode->next) {
+		pNode = va_node_get(&addr->allocator, index);
+
+		if (pNode->base <= base && pNode->base + pNode->range >= base + range) {
+			/* Freed range lies wholly inside this claim. */
+			if (pNode->base == base) {
+				/* Trim from the front. */
+				pNode->base += range;
+				pNode->range -= range;
+				if (0 == pNode->range) {
+					/* Exact fit: unlink and recycle the node. */
+					if (pPrev)
+						pPrev->next = pNode->next;
+					else
+						addr->claims_list = pNode->next;
+					va_node_free(&addr->allocator, index);
+				}
+			} else if (pNode->base + pNode->range == base + range) {
+				/* Trim from the back. */
+				pNode->range -= range;
+			} else {
+				/* Middle removal: split into head and tail claims. */
+				err = va_node_alloc(&addr->allocator, &new_index);
+				BUG_ON(err < 0);
+				pNewNode = va_node_get(&addr->allocator, new_index);
+				pNewNode->base = base + range;
+				/*
+				 * Tail runs from the end of the freed range to the
+				 * end of the original claim. The old code computed
+				 * pNode->range - pNewNode->base, omitting the
+				 * pNode->base term and corrupting the tail length
+				 * whenever pNode->base != 0.
+				 */
+				pNewNode->range = pNode->base + pNode->range - pNewNode->base;
+				pNewNode->next = pNode->next;
+				pNode->range = base - pNode->base;
+				pNode->next = new_index;
+			}
+			return;
+		}
+		if (pNode->base > base + range) {
+			pr_debug("Freed claim not found in the list\n");
+			return;
+		}
+
+		if ((pNode->base < base) ?
+			(pNode->base + pNode->range > base) :
+			(base + range > pNode->base)) {
+			pr_debug("Freed claim partially overlaps the list\n");
+			return;
+		}
+		pPrev = pNode;
+	}
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_insert_and_coalesce
+
+DESCRIPTION:
+O(n) search through free list sorted by base
+should average O(n/2), and free list should be much less than the # allocated
+coalesce with node before/after if possible
+3 possible outcomes:
+1. freed node is inserted into list (0 deallocated)
+2. freed node range coalesced with existing node,
+so freed node can be deallocated (1 deallocated)
+3. freed node + another node are coalesced + deallocated
+(2 deallocated)
+Fails if there is full or partial overlap between inserted
+range and ranges in the list
+
+returns 0 on success, negative if insert failed (overlap detected)
+\* ************************************************************************* */
+static int
+va_gen_insert_and_coalesce(struct va_node_allocator *allocator, uint32_t *list,
+ uint64_t base, uint64_t range)
+{
+ // search through free list, insert ordered
+ // also check for coalesce
+ uint32_t findPtr = *list;
+ uint32_t prev = *list;
+ uint64_t end_range = base + range;
+ uint32_t nextPtr, ptr;
+ struct va_node *nextNode, *node;
+ int err;
+
+ while (va_node_is_valid(findPtr)) {
+ struct va_node *find = va_node_get(allocator, findPtr);
+ // overlap?
+ // A.start < B.start && A.end > B.start A-B==A-B A-B==B-A otherwise A-A B-B
+ // B.start < A.start && B.end > A.start B-A==B-A B-A==A-B otherwise B-B A-A
+ // =>
+ // A.start < B.start ? A.end > B.start : B.end > A.start
+
+ if ((find->base < base) ?
+ (find->base + find->range > base) :
+ (end_range > find->base)) {
+ return -1;
+ }
+ //----------------------------------------------------------
+ // coalesce? 2 possibilities:
+ // 1. (pFind->base + pFind->range) == current.base
+ // coalesce, check next node base = endrange,
+ // coalesce with next if possible, deallocate next, exit
+ // 2. end_range == pFind->base
+ // coalesce, exit
+ if (end_range == find->base) {
+ // New range abuts the front of this node: extend it downward.
+ find->base = base;
+ find->range += range;
+ return 0;
+ } else if ((find->base + find->range) == base) {
+ // New range abuts the back of this node: extend it upward.
+ // leave the base unchanged
+ find->range += range;
+ // check the next node to see if it coalesces too
+ nextPtr = find->next;
+ if (va_node_is_valid(nextPtr)) {
+ nextNode = va_node_get(allocator, nextPtr);
+ // end_range is the same after prior coalesce
+ if (nextNode->base == end_range) {
+ // Bridged the gap between two nodes: merge and free the second.
+ find->range += nextNode->range;
+ find->next = nextNode->next;
+ va_node_free(allocator, nextPtr);
+ }
+ }
+ return 0;
+ }
+ // end coalesce
+
+ //----------------------------------------------------------
+ // insert if found a node at a greater address
+ else if (find->base > end_range)
+ // exit loop, insert node
+ break;
+ // nothing found yet, next index
+ prev = findPtr;
+ findPtr = find->next;
+ }
+
+ //----------------------------------------------------------
+ // insert or append if node
+ // could be at the end or empty free list (find index = INVALID)
+ // or, next node has larger base
+ //----------------------------------------------------------
+ err = va_node_alloc(allocator, &ptr);
+ BUG_ON(err < 0);
+ if (!va_node_is_valid(ptr)) {
+ // NOTE(review): the range is silently dropped here (leaked, not
+ // an error to the caller) — presumably deliberate best-effort;
+ // confirm before changing.
+ printk(KERN_ERR "FAILED to add hole! base = %lld, range = %lld\n", base, range);
+ return 0;
+ }
+ node = va_node_get(allocator, ptr);
+ node->base = base;
+ node->range = range;
+ node->next = findPtr;
+ // First node or empty list (Alloc() can empty the list)
+ if (findPtr == *list)
+ // New node becomes the head of the list.
+ *list = ptr;
+ else { // reached the end of the list or insertion
+ BUG_ON(!va_node_is_valid(prev));
+ // Link the new node after its in-order predecessor.
+ (va_node_get(allocator, prev))->next = ptr;
+ }
+ return 0;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_free_internal
+
+DESCRIPTION:
+Returns [base, base + range) to the hole list of available virtual
+addresses. BUGs when the range overlaps an existing hole (double free).
+\* ************************************************************************* */
+static void
+va_gen_free_internal(struct va_gen_addr *addr, uint64_t base, uint64_t range)
+{
+	int rc;
+
+	rc = va_gen_insert_and_coalesce(&addr->allocator, &addr->hole_list,
+			base, range);
+	BUG_ON(rc < 0);
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_alloc_aligned
+Allocate virtual memory space with the requested alignment.
+
+Note: "Quick and dirty" aligned Alloc on top of non-aligned Alloc:
+over-allocate by unit_align - 1 pages, round the base up, then return the
+unused head and tail fragments to the hole list.
+
+Return: Offset to allocated virtual address if successful (in pages).
+INVALID_VA_PAGE_INDEX if failed
+\* ************************************************************************* */
+static uint64_t
+va_gen_alloc_aligned(struct va_gen_addr *addr, uint64_t range, uint32_t unit_align)
+{
+	uint64_t base_address, aligned_base, alloc_end;
+
+	/*
+	 * Validate before allocating; the old code allocated first and
+	 * leaked the pages whenever the arguments were rejected.
+	 */
+	if (0 == range || 0 == unit_align)
+		return INVALID_VA_PAGE_INDEX;
+
+	base_address = va_gen_alloc_internal(addr, range + unit_align - 1);
+	if (unit_align == 1 || base_address == INVALID_VA_PAGE_INDEX)
+		return base_address;
+
+	/*
+	 * Round the base up to the alignment unit. The old code set
+	 * aligned_base = base_address unconditionally, so no alignment was
+	 * ever applied and no fragment was ever returned.
+	 */
+	aligned_base = base_address + unit_align - 1;
+	aligned_base -= aligned_base % unit_align;
+	/* One past the last page of the over-sized allocation. */
+	alloc_end = base_address + range + unit_align - 1;
+
+	/* Give back the unused head of the over-allocation. */
+	if (aligned_base > base_address)
+		va_gen_free_internal(addr, base_address, aligned_base - base_address);
+
+	/* Give back the unused tail of the over-allocation. */
+	if (aligned_base + range < alloc_end)
+		va_gen_free_internal(addr, aligned_base + range,
+			alloc_end - (aligned_base + range));
+	return aligned_base;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_claim_internal
+
+DESCRIPTION:
+Claims a SVAS range. Checks if range was claimed before; if not, records
+the claim in the claims list.
+
+Returns negative if the claim overlaps an existing one, 0 on success.
+\* ************************************************************************* */
+static int
+va_gen_claim_internal(struct va_gen_addr *addr, uint64_t base, uint64_t range)
+{
+	struct va_node_allocator *alloc = &addr->allocator;
+
+	return va_gen_insert_and_coalesce(alloc, &addr->claims_list, base, range);
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_gen_alloc
+Allocate virtual memory space.
+
+Note: Wrapper for unit-testable address generator to add critical
+section and convert bytes to pages.
+Note: Free() selects between Free[Alloc] and FreeClaim based on
+the address range of the freed address.
+
+Return: Allocated virtual address if successful (in bytes)
+INVALID_VA_GEN_ADDRESS if failed
+\* ************************************************************************* */
+uint64_t
+va_gen_alloc(struct va_gen_addr *addr, uint64_t num_bytes, uint32_t align_bytes)
+{
+	// Convert input bytes to pages which is our unit for the address generator.
+	uint64_t num_pages = (uint64_t)(((PAGE_SIZE - 1) + num_bytes) / PAGE_SIZE);
+	uint64_t align_pages = align_bytes / PAGE_SIZE;
+	uint64_t va_page_index, ret;
+
+	// Alignment below one page makes no sense here.
+	if (align_bytes < PAGE_SIZE) {
+		ret = INVALID_VA_GEN_ADDRESS;
+		WARN_ON(1);
+		goto done;
+	}
+
+	// Page count must fit the allocator's 32-bit arithmetic.
+	if (num_bytes > (0xffffffffULL * PAGE_SIZE)) {
+		ret = INVALID_VA_GEN_ADDRESS;
+		WARN_ON(1);
+		goto done;
+	}
+
+	/*
+	 * Truncate the page alignment to 32 bits with a mask. The old code
+	 * used '% 0xffffffff' (modulo 2^32 - 1), which is not a truncation
+	 * and mapped an alignment of exactly 0xffffffff pages to zero.
+	 */
+	va_page_index = va_gen_alloc_aligned(addr, num_pages,
+			(uint32_t)(align_pages & 0xffffffff));
+
+	if (va_page_index == INVALID_VA_PAGE_INDEX)
+		return INVALID_VA_GEN_ADDRESS;
+
+	// Convert page number to virtual address, adding base.
+	ret = (va_page_index << PAGE_SHIFT) + addr->base;
+done:
+	return ret;
+}
+
+/*
+ * va_gen_claim - claim ownership of a memory region below the generator's
+ * base. The region must be page aligned and must not already be claimed.
+ *
+ * Returns 'address' on success, INVALID_VA_GEN_ADDRESS on failure.
+ */
+uint64_t
+va_gen_claim(struct va_gen_addr *addr, uint64_t address, uint64_t num_bytes)
+{
+	uint64_t page, npages;
+
+	/* Claims must lie entirely below base and start on a page boundary. */
+	if (address + num_bytes > addr->base)
+		return INVALID_VA_GEN_ADDRESS;
+	if (address & (PAGE_SIZE - 1))
+		return INVALID_VA_GEN_ADDRESS;
+
+	page = (uint64_t)(address >> PAGE_SHIFT);
+	/* Convert bytes to pages, the generator's unit, rounding up. */
+	npages = (uint64_t)(((PAGE_SIZE - 1) + num_bytes) / PAGE_SIZE);
+	if (va_gen_claim_internal(addr, page, npages) < 0)
+		return INVALID_VA_GEN_ADDRESS;
+
+	return address;
+}
+
+/*
+ * va_gen_free - release a range so its pages may be re-assigned.
+ * Addresses at or above the generator's base were produced by
+ * va_gen_alloc() and go back to the hole list; addresses below base
+ * were recorded by va_gen_claim() and come off the claims list.
+ */
+void
+va_gen_free(struct va_gen_addr *addr, uint64_t address, uint64_t num_bytes)
+{
+	/* Convert bytes to pages, the generator's unit, rounding up. */
+	uint64_t npages = (uint64_t)(((PAGE_SIZE - 1) + num_bytes) / PAGE_SIZE);
+	uint64_t page;
+
+	if (address >= addr->base) {
+		/* Allocation: convert back to a page index relative to base. */
+		page = (uint64_t)((address - addr->base) >> PAGE_SHIFT);
+		va_gen_free_internal(addr, page, npages);
+	} else {
+		/* Claim: page index is absolute. */
+		page = (uint64_t)(address >> PAGE_SHIFT);
+		va_gen_free_claim(addr, page, npages);
+	}
+}
+
+/*
+ * va_gen_init - initialize the generator for [base, base + range) bytes.
+ * The internal generator works in pages, so the range is converted first;
+ * 'base' is recorded only when initialization succeeds.
+ */
+int
+va_gen_init(struct va_gen_addr *addr, uint64_t base, uint64_t range)
+{
+	uint64_t range_in_pages = (uint64_t)(range >> PAGE_SHIFT);
+	int err = va_gen_init_internal(addr, range_in_pages);
+
+	if (err == 0)
+		addr->base = base;
+	return err;
+}
+
+/* va_gen_destroy - release every slab held by the node allocator. */
+void
+va_gen_destroy(struct va_gen_addr *addr)
+{
+	va_node_destroy(&addr->allocator);
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/***************************************************************************\
+manage available nodes for VaGenAddress
+\***************************************************************************/
+#include "mic/micscif.h"
+
+/***************************************************************************\
+FUNCTION: va_node_init
+
+DESCRIPTION: constructor for allocator for GfxGenAddress; starts with no
+slabs and an empty free list.
+\***************************************************************************/
+void va_node_init(struct va_node_allocator *node)
+{
+	node->slab_shift = 7; /* 2^7 -> 128 nodes in the slab */
+	node->nodes_in_slab = 1 << node->slab_shift;
+	node->slab_mask = node->nodes_in_slab - 1;
+	node->pp_slab_directory = 0;
+	node->num_slabs = 0;
+	node->num_free_slabs = 0;
+	node->free_list = invalid_va_node_index;
+}
+
+/* va_node_is_valid - true for any index other than the invalid sentinel. */
+int va_node_is_valid(uint32_t index)
+{
+	return index != invalid_va_node_index;
+}
+
+/************************************************************************** *\
+FUNCTION: va_node_destroy
+
+DESCRIPTION: destructor for allocator for GfxGenAddress; frees every slab
+and then the slab directory itself.
+\************************************************************************** */
+void va_node_destroy(struct va_node_allocator *node)
+{
+	uint32_t slab;
+
+	if (!node->pp_slab_directory)
+		return;
+
+	for (slab = 0; slab < node->num_slabs; slab++) {
+		kfree(node->pp_slab_directory[slab]);
+		node->pp_slab_directory[slab] = NULL;
+	}
+	kfree(node->pp_slab_directory);
+	node->pp_slab_directory = NULL;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_node_realloc
+
+DESCRIPTION: grow the slab directory (the array of slab pointers) to
+2*num_slabs+1 entries, copying the existing slab pointers across.
+Returns 0 on success, -ENOMEM if the new directory cannot be allocated
+(the old directory is left intact in that case).
+\* ************************************************************************* */
+static int va_node_realloc(struct va_node_allocator *node)
+{
+	uint32_t growSlabs = 2 * (node->num_slabs) + 1;
+	/* kcalloc() checks the count*size multiplication for overflow,
+	 * unlike the original kzalloc(sizeof(ptr) * n) form. */
+	struct va_node **ppGrowDirectory =
+		kcalloc(growSlabs, sizeof(struct va_node *), GFP_KERNEL);
+	uint32_t i;
+
+	if (!ppGrowDirectory)
+		return -ENOMEM;
+
+	if (node->num_slabs) {
+		for (i = 0; i < node->num_slabs; i++)
+			ppGrowDirectory[i] = node->pp_slab_directory[i];
+		kfree(node->pp_slab_directory);
+		node->pp_slab_directory = NULL;
+	}
+	node->pp_slab_directory = ppGrowDirectory;
+	node->num_free_slabs = growSlabs - node->num_slabs;
+	return 0;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_node_grow
+
+DESCRIPTION: allocate one more slab of nodes_in_slab va_nodes, link every
+new node onto the head of the allocator's free list, and update the slab
+bookkeeping.  Grows the slab directory first if it has no free entry.
+Returns 0 on success or -ENOMEM on allocation failure.
+\* ************************************************************************* */
+static int va_node_grow(struct va_node_allocator *node)
+{
+ struct va_node *pNewSlab;
+ uint32_t i, start;
+ int ret;
+
+ /* directory full: double it (2n+1 entries) before adding a slab */
+ if (!node->num_free_slabs)
+ if ((ret = va_node_realloc(node)) < 0)
+ return ret;
+
+ pNewSlab = kzalloc(sizeof(struct va_node) *
+ node->nodes_in_slab, GFP_KERNEL);
+ if (pNewSlab)
+ node->pp_slab_directory[node->num_slabs] = pNewSlab;
+ else
+ return -ENOMEM;
+
+ /*--------------------------------------------------------
+ * add new nodes to free list
+ * slightly better than just calling free() for each index
+ */
+ /* flat index of the first node in the new slab */
+ start = node->num_slabs * node->nodes_in_slab;
+ for (i = 0; i < (node->nodes_in_slab-1); i++)
+ /* chain node i to node i+1 within this slab */
+ pNewSlab[i].next = start + i + 1;
+ /* add new allocations to start of list */
+ pNewSlab[node->nodes_in_slab-1].next = node->free_list;
+ node->free_list = start;
+ /*-------------------------------------------------------*/
+
+ /* update bookkeeping for array of arrays */
+ node->num_slabs++;
+ node->num_free_slabs--;
+ return 0;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_node_get
+
+DESCRIPTION: translate a flat node index into a node pointer: the high
+bits select the slab, the low slab_mask bits select the entry in it.
+\* ************************************************************************* */
+struct va_node *va_node_get(struct va_node_allocator *node, uint32_t index)
+{
+	struct va_node *slab = node->pp_slab_directory[index >> node->slab_shift];
+
+	return slab + (index & node->slab_mask);
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_node_alloc
+
+DESCRIPTION: pop a node index off the free list, growing the pool first
+when it is empty.  On success *out_alloc holds the index and 0 is
+returned; otherwise the negative errno from the grow path is returned.
+\* ************************************************************************* */
+int va_node_alloc(struct va_node_allocator *node, uint32_t *out_alloc)
+{
+	int ret;
+	uint32_t head = node->free_list;
+
+	if (!va_node_is_valid(head)) {
+		ret = va_node_grow(node);
+		if (ret < 0)
+			return ret;
+		head = node->free_list;
+	}
+	*out_alloc = head;
+	node->free_list = va_node_get(node, head)->next;
+	return 0;
+}
+
+/* ************************************************************************* *\
+FUNCTION: va_node_free
+
+DESCRIPTION: push the node at 'index' back onto the head of the free list.
+\* ************************************************************************* */
+void va_node_free(struct va_node_allocator *node, uint32_t index)
+{
+	va_node_get(node, index)->next = node->free_list;
+	node->free_list = index;
+}
--- /dev/null
+obj-m := mpssboot.o
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <scif.h>
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+
+#define ACPT_BOOTED 1
+#define ACPT_BOOT_ACK 2
+#define ACPT_NACK_VERSION 3
+#define ACPT_REQUEST_TIME 4
+#define ACPT_TIME_DATA 5
+
+#define ACPT_VERSION 1
+
+static dev_t dev;
+static struct class *class;
+static struct device *mbdev;
+
+static int host_notified;
+static struct timespec tod;
+static int timeset = 0;
+
+/* synctime sysfs show: report whether the card clock has been synced. */
+static ssize_t
+show_timesync(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	const char *state = timeset ? "set" : "not set";
+
+	return snprintf(buf, PAGE_SIZE, "Time: %s\n", state);
+}
+
+/*
+ * synctime sysfs store: pull the host's time of day over SCIF (the
+ * ACPT_REQUEST_TIME / ACPT_TIME_DATA exchange) and set it on the card.
+ * Always returns 'count' so the writer is not re-driven on failure.
+ * Fixes vs. original: 'epd' is a local (a static endpoint was shared by
+ * concurrent writers), scif_open() failure is checked before use, and
+ * the success-message typo ("sycned") is corrected.
+ */
+static ssize_t
+set_synctime(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct scif_portID port = {0, MIC_NOTIFY};
+	scif_epd_t epd;
+	int proto = ACPT_REQUEST_TIME;
+	int version = ACPT_VERSION;
+	int err;
+
+	epd = scif_open();
+	if (!epd) {
+		printk("MPSSBOOT error, synctime scif_open failed\n");
+		return count;
+	}
+
+	if ((err = scif_connect(epd, &port))) {
+		printk("MPSSBOOT error, synctime connect failed: %d\n", err);
+		goto close_synctime;
+	}
+
+	if ((err = scif_send(epd, &version, sizeof(version), 0)) != sizeof(version)) {
+		printk("MPSSBOOT send version failed: %d\n", err);
+		goto close_synctime;
+	}
+
+	if ((err = scif_send(epd, &proto, sizeof(proto), 0)) != sizeof(proto)) {
+		printk("MPSSBOOT send boot finished failed: %d\n", err);
+		goto close_synctime;
+	}
+
+	if ((err = scif_recv(epd, &proto, sizeof(proto), SCIF_RECV_BLOCK)) != sizeof(proto)) {
+		printk("MPSSBOOT protocol recv ack failed: %d\n", err);
+		goto close_synctime;
+	}
+
+	if (proto != ACPT_TIME_DATA) {
+		printk("MPSSBOOT failed to receive time data packet %d\n", proto);
+		goto close_synctime;
+	}
+
+	if ((err = scif_recv(epd, &tod, sizeof(tod), SCIF_RECV_BLOCK)) != sizeof(tod)) {
+		printk("MPSSBOOT time data read size failed: %d\n", err);
+		goto close_synctime;
+	}
+
+	do_settimeofday(&tod);
+	printk("MPSSBOOT Time of day synced with host\n");
+	timeset = 1;
+
+close_synctime:
+	scif_close(epd);
+	return count;
+}
+static DEVICE_ATTR(synctime, S_IRUGO | S_IWUSR, show_timesync, set_synctime);
+
+/* host_notified sysfs show: report the boot-notification flag. */
+static ssize_t
+show_host_notified(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int notified = host_notified;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", notified);
+}
+
+/*
+ * host_notified sysfs store: tell the host PM agent the card has booted
+ * (ACPT_BOOTED) and wait for ACPT_BOOT_ACK.  Always returns 'count'.
+ * Fixes vs. original: 'epd' is a local (a static endpoint was shared by
+ * concurrent writers), scif_open() failure is checked before use, and
+ * the host_notified flag is actually set on a successful ACK (it was
+ * never written, so the show side always reported 0).
+ */
+static ssize_t
+set_host_notified(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct scif_portID port = {0, MIC_NOTIFY};
+	scif_epd_t epd;
+	int proto = ACPT_BOOTED;
+	int version = ACPT_VERSION;
+	int err;
+
+	epd = scif_open();
+	if (!epd) {
+		printk("MPSSBOOT error, notify scif_open failed\n");
+		return count;
+	}
+
+	if ((err = scif_connect(epd, &port))) {
+		printk("MPSSBOOT error, notify connect failed: %d\n", err);
+		goto close_notify;
+	}
+
+	if ((err = scif_send(epd, &version, sizeof(version), 0)) != sizeof(version)) {
+		printk("MPSSBOOT send version failed: %d\n", err);
+		goto close_notify;
+	}
+
+	if ((err = scif_send(epd, &proto, sizeof(proto), 0)) != sizeof(proto)) {
+		printk("MPSSBOOT send boot finished failed: %d\n", err);
+		goto close_notify;
+	}
+
+	if ((err = scif_recv(epd, &proto, sizeof(proto), SCIF_RECV_BLOCK)) != sizeof(proto)) {
+		printk("MPSSBOOT protocol recv ack failed: %d\n", err);
+		goto close_notify;
+	}
+
+	if (proto != ACPT_BOOT_ACK) {
+		printk("MPSSBOOT failed to receive boot ACK, got %d\n", proto);
+	} else {
+		printk("MPSSBOOT Boot acknowledged\n");
+		host_notified = 1;
+	}
+
+close_notify:
+	scif_close(epd);
+	return count;
+}
+static DEVICE_ATTR(host_notified, S_IRUGO | S_IWUSR, show_host_notified, set_host_notified);
+
+/* sysfs attributes exposed on the notify device: synctime, host_notified. */
+static struct attribute *mb_attributes[] = {
+ &dev_attr_synctime.attr,
+ &dev_attr_host_notified.attr,
+ NULL
+};
+
+/* Attribute group registered on the device by mpssboot_init(). */
+struct attribute_group mb_attr_group = {
+ .attrs = mb_attributes
+};
+
+/*
+ * Module unload: remove the sysfs group, destroy the device and class,
+ * and release the char-dev region reserved by mpssboot_init() (the
+ * original leaked the region allocated via alloc_chrdev_region()).
+ */
+static void
+mpssboot_exit(void)
+{
+	sysfs_remove_group(&mbdev->kobj, &mb_attr_group);
+	device_destroy(class, dev);
+	class_destroy(class);
+	unregister_chrdev_region(dev, 2);
+}
+
+/* Devnode callback: name the /dev node after the device itself. */
+static char *
+mpssboot_devnode(struct device *dev, mode_t *mode)
+{
+	const char *name = dev_name(dev);
+
+	return kasprintf(GFP_KERNEL, "%s", name);
+}
+
+/*
+ * Module init: reserve a char-dev region, create the "micnotify" class
+ * and "notify" device, and attach the sysfs attribute group.  Returns 0
+ * or a negative errno.  Fixes vs. original: every registration call is
+ * now checked and unwound on failure (the original ignored all errors
+ * and papered over the unused result with "result = result;").
+ */
+static int
+mpssboot_init(void)
+{
+	int result;
+
+	result = alloc_chrdev_region(&dev, 0, 2, "micnotify");
+	if (result < 0)
+		return result;
+
+	class = class_create(THIS_MODULE, "micnotify");
+	if (IS_ERR(class)) {
+		result = PTR_ERR(class);
+		goto err_chrdev;
+	}
+	class->devnode = mpssboot_devnode;
+
+	mbdev = device_create(class, NULL, dev, NULL, "notify");
+	if (IS_ERR(mbdev)) {
+		result = PTR_ERR(mbdev);
+		goto err_class;
+	}
+
+	result = sysfs_create_group(&mbdev->kobj, &mb_attr_group);
+	if (result)
+		goto err_device;
+	return 0;
+
+err_device:
+	device_destroy(class, dev);
+err_class:
+	class_destroy(class);
+err_chrdev:
+	unregister_chrdev_region(dev, 2);
+	return result;
+}
+
+module_init(mpssboot_init);
+module_exit(mpssboot_exit);
+MODULE_LICENSE("GPL");
+
--- /dev/null
+obj-m := pm_scif.o
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <scif.h>
+#include <mic/mic_pm.h>
+#include <mic/micscif.h>
+#include "pm_scif.h"
+
+#define PM_DB(fmt, ...) printk(KERN_ALERT"[ %s : %d ]:"fmt,__func__, __LINE__, ##__VA_ARGS__)
+#define FUNCTION_ENTRY PM_DB("==> %s\n", __func__)
+#define FUNCTION_EXIT PM_DB("<== %s\n", __func__)
+
+#define PM_SCIF_RETRY_COUNT 5
+
+DEFINE_RWLOCK(pmscif_send);
+
+static atomic_t epinuse = ATOMIC_INIT(0);
+void pm_scif_exit(void);
+
+/* PM/SCIF connection state: the endpoint to the host plus the receive
+ * machinery that processes host-originated PM messages. */
+typedef struct _mic_pm_scif {
+ scif_epd_t ep; /* SCIF endpoint connected to the host */
+ int lport; /* local port returned by scif_bind() */
+ struct scif_portID rport_id; /* host side: node 0, SCIF_PM_PORT_0 */
+ struct workqueue_struct *pm_recvq; /* single-threaded receive queue */
+ struct work_struct pm_recv; /* work item: pm_recv_from_host() */
+ PM_CONNECTION_STATE con_state; /* PM_CONNECTED while the link is up */
+} mic_pm_scif;
+
+/* Singleton connection state; NULL until pm_scif_init() allocates it. */
+mic_pm_scif *pm_scif;
+
+/*
+ * Hex-dump 'len' bytes of 'buf' to the kernel log, 8 bytes per line.
+ * Fixes vs. original: the newline test was inverted (it broke the line
+ * on every index NOT divisible by 8), and bytes are printed through an
+ * unsigned char so negative chars don't sign-extend into "ffffff..".
+ */
+void
+pm_dump(char *buf, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (i && !(i % 8))
+			printk(KERN_ALERT"\n");
+		printk(KERN_ALERT"%x ", (unsigned char)buf[i]);
+	}
+}
+
+/* Handler for PM_MESSAGE_OPEN: trace entry and hex-dump the payload. */
+static void pm_handle_open (void *msg, size_t len)
+{
+	FUNCTION_ENTRY;
+	pm_dump(msg, len);
+}
+
+/* Handler for PM_MESSAGE_TEST: trace entry and hex-dump the payload. */
+static void pm_handle_test (void *msg, size_t len)
+{
+	FUNCTION_ENTRY;
+	pm_dump(msg, len);
+}
+/* Signature of a per-opcode PM message handler. */
+typedef void (*_pm_msg_handler)(void*, size_t);
+
+/* Dispatch-table entry: handler plus its name (kept for debug prints). */
+typedef struct _pm_msg_call {
+ _pm_msg_handler handler;
+ char *name;
+}pm_msg_call;
+
+/* Designated-initializer helper: index by opcode, stringify the handler. */
+#define PM_HANDLE_ADD(opcode, function) [(opcode)] = {(function), #function}
+
+/* Opcode -> handler dispatch table; unlisted opcodes have NULL handlers. */
+pm_msg_call pm_msg_caller[PM_MESSAGE_MAX] = {
+ PM_HANDLE_ADD(PM_MESSAGE_OPEN, pm_handle_open),
+ PM_HANDLE_ADD(PM_MESSAGE_TEST, pm_handle_test)
+};
+
+/*
+ * Send one PM message (pm_msg_header + optional payload) to the host
+ * over the established SCIF endpoint.  Returns 0 or the (positive)
+ * scif_send byte count on success, a negative errno on failure, or -1
+ * if the endpoint is busy.  Fixes vs. original: guards against a NULL
+ * pm_scif (pm_scif_exit() can run before the allocation succeeds) and
+ * the send-failure log no longer claims "scif_recv failed".
+ */
+int
+pm_send_to_host(PM_MESSAGE opcode, void *msg, size_t len)
+{
+	int err = 0;
+	size_t psize = sizeof(pm_msg_header) + len;
+	char *payload;
+	unsigned long flags;
+
+	if (!pm_scif || pm_scif->con_state != PM_CONNECTED) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	if (!(payload = kmalloc(psize, GFP_ATOMIC))) {
+		err = -ENOMEM;
+		goto error;
+	}
+	read_lock_irqsave(&pmscif_send, flags);
+
+	/* single-sender gate on the shared endpoint */
+	if (atomic_xchg(&epinuse, 1) != 0) {
+		read_unlock_irqrestore(&pmscif_send, flags);
+		kfree(payload);
+		return -1;
+	}
+
+	((pm_msg_header*)payload)->opcode = opcode;
+	((pm_msg_header*)payload)->len = len;
+	if (len)
+		memcpy((char*)payload + sizeof(pm_msg_header), msg, len);
+
+	/* 0 == non-blocking send */
+	if ((err = scif_send(pm_scif->ep, payload, psize, 0)) < 0) {
+		PM_DB("scif_send failed\n");
+	}
+	atomic_set(&epinuse, 0);
+	read_unlock_irqrestore(&pmscif_send, flags);
+	kfree(payload);
+error:
+	return err;
+}
+
+EXPORT_SYMBOL(pm_send_to_host);
+
+static struct mic_pmscif_handle micpmscif = {
+ .pm_scif_uos2host = pm_send_to_host,
+ .pm_scif_host2uos = NULL,
+ .owner = THIS_MODULE,
+};
+
+
+
+/* Forward a host-originated message up to the uOS client, if one has
+ * registered a host2uos callback. */
+static void pm_send_to_uos(pm_msg_header *header, char *msg)
+{
+	if (micpmscif.pm_scif_host2uos == NULL)
+		return;
+	micpmscif.pm_scif_host2uos(header, msg);
+}
+
+/*
+ * Work-queue handler: receive one PM message (header, then payload) from
+ * the host, dispatch it through pm_msg_caller / pm_send_to_uos, and
+ * re-queue itself for the next message.  A CLOSE / CLOSE_ACK opcode ends
+ * the conversation: a CLOSE is ACKed, the link is marked disconnecting,
+ * and the work is NOT re-queued.  Fixes vs. original: both kmalloc()
+ * results are checked before the buffers are handed to scif_recv().
+ */
+static void
+pm_recv_from_host(struct work_struct *work)
+{
+	int err = 0;
+	char *msg = NULL;
+	pm_msg_header *header = NULL;
+	mic_pm_scif *pm_scif_info = container_of(work, mic_pm_scif, pm_recv);
+
+	FUNCTION_ENTRY;
+	if (pm_scif->con_state != PM_CONNECTED)
+		goto exit;
+
+	header = kmalloc(sizeof(pm_msg_header), GFP_KERNEL);
+	if (!header)
+		goto exit;
+
+	if ((err = scif_recv(pm_scif_info->ep, header, sizeof(pm_msg_header),
+			SCIF_RECV_BLOCK)) < 0) {
+		PM_DB("scif_recv failed\n");
+		goto end_con;
+	}
+
+	msg = kmalloc(header->len, GFP_KERNEL);
+	if (!msg)
+		goto end_con;
+
+	if ((err = scif_recv(pm_scif_info->ep, msg, header->len,
+			SCIF_RECV_BLOCK)) < 0) {
+		PM_DB("scif_recv failed\n");
+		goto end_con;
+	}
+	if (header->opcode < PM_MESSAGE_MAX) {
+		if ((header->opcode != PM_MESSAGE_CLOSE) &&
+		    (header->opcode != PM_MESSAGE_CLOSE_ACK)) {
+			if (pm_msg_caller[header->opcode].handler)
+				pm_msg_caller[header->opcode].handler(msg, header->len);
+			pm_send_to_uos(header, msg);
+		} else {
+			if (header->opcode == PM_MESSAGE_CLOSE) {
+				pm_send_to_uos(header, msg);
+				pm_send_to_host(PM_MESSAGE_CLOSE_ACK, NULL, 0);
+			}
+			pm_scif->con_state = PM_DISCONNECTING;
+			goto end_con;
+		}
+	}
+	else
+		printk("pm_scif: Recvd scif message with bad opcode %d\n",
+				header->opcode);
+	kfree(header);
+	kfree(msg);
+	/* re-arm for the next incoming message */
+	queue_work(pm_scif->pm_recvq, &pm_scif->pm_recv);
+	return;
+
+end_con:
+	kfree(header);
+	kfree(msg);
+exit:
+	FUNCTION_EXIT;
+}
+
+#ifdef PM_SCIF_IOCTL
+/* Debug ioctl: fire one test message per opcode below PM_MESSAGE_TEST,
+ * each carrying a distinct 0xc0deXXXX payload.  Always returns 0. */
+static int
+spm_ioctl(struct inode *in, struct file *f, unsigned int cmd, unsigned long arg)
+{
+	uint32_t payload = 0xc0de0000;
+	int op;
+
+	FUNCTION_ENTRY;
+	for (op = 0; op < PM_MESSAGE_TEST; op++) {
+		payload++;
+		pm_send_to_host(op, &payload, sizeof(payload));
+	}
+
+	return 0;
+}
+
+/* unlocked_ioctl shim: recover the inode and defer to spm_ioctl(). */
+static long
+spm_unlocked_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = f->f_path.dentry->d_inode;
+
+	return (long) spm_ioctl(inode, f, cmd, arg);
+}
+
+/* release(): no per-open state to tear down; always succeeds. */
+static int
+spm_release(struct inode *in, struct file *f)
+{
+ return 0;
+}
+
+/* Devnode callback: place the node under /dev/spm/. */
+static char *
+spm_devnode(struct device *dev, mode_t *mode)
+{
+	const char *name = dev_name(dev);
+
+	return kasprintf(GFP_KERNEL, "spm/%s", name);
+}
+
+
+/* open(): no per-open state to set up; always succeeds. */
+static int
+spm_open(struct inode *in, struct file *f)
+{
+ return 0;
+}
+
+/* Char-device file operations for the /dev/spm debug node. */
+struct file_operations spm_ops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = spm_unlocked_ioctl,
+ .open = spm_open,
+ .release = spm_release,
+};
+
+/* spm char-device bookkeeping; spm_major == 0 => allocate dynamically. */
+int spm_major;
+int spm_minor;
+dev_t spmdev;
+struct cdev spmcdev;
+struct class *spmclass;
+
+/* Tear down the /dev/spm node: device, class, cdev and the dev_t region,
+ * in reverse order of creation.  NOTE(review): also reached from error
+ * paths with partially-initialized state -- assumes these kernel calls
+ * tolerate that; verify on the target kernel. */
+static void
+spm_dev_deinit(void)
+{
+ device_destroy(spmclass,spmdev);
+ class_destroy(spmclass);
+ cdev_del(&spmcdev);
+ unregister_chrdev_region(spmdev, 1);
+}
+
+/*
+ * Create the /dev/spm debug node: reserve a dev_t, add the cdev, create
+ * the "spm" class and the device.  Returns 0 or a negative errno.
+ * Fixes vs. original: device_create()'s result is checked (the original
+ * re-tested IS_ERR(spmclass) instead of the created device), the chrdev
+ * region is no longer "unregistered" when its registration itself
+ * failed, and the error unwind only undoes steps that succeeded.
+ */
+static int
+spm_dev_init(void)
+{
+	struct device *sdev;
+	int err = 0;
+
+	if (spm_major) {
+		spmdev = MKDEV(spm_major, spm_minor);
+		err = register_chrdev_region(spmdev, 1, "spm");
+	} else {
+		err = alloc_chrdev_region(&spmdev, spm_minor, 1, "spm");
+		spm_major = MAJOR(spmdev);
+	}
+	if (err < 0)
+		return err;
+
+	spmdev = MKDEV(spm_major, spm_minor);
+	cdev_init(&spmcdev, &spm_ops);
+	spmcdev.owner = THIS_MODULE;
+	err = cdev_add(&spmcdev, spmdev, 1);
+	if (err)
+		goto err_region;
+
+	spmclass = class_create(THIS_MODULE, "spm");
+	if (IS_ERR(spmclass)) {
+		err = PTR_ERR(spmclass);
+		goto err_cdev;
+	}
+	spmclass->devnode = spm_devnode;
+
+	sdev = device_create(spmclass, NULL, spmdev, NULL, "spm");
+	if (IS_ERR(sdev)) {
+		err = PTR_ERR(sdev);
+		goto err_class;
+	}
+	return 0;
+
+err_class:
+	class_destroy(spmclass);
+err_cdev:
+	cdev_del(&spmcdev);
+err_region:
+	unregister_chrdev_region(spmdev, 1);
+	return err;
+}
+#endif
+
+/*
+ * Module init: allocate the connection state, register with the core PM
+ * layer, open/bind a SCIF endpoint and connect to the host PM agent at
+ * (node 0, SCIF_PM_PORT_0), retrying once a second up to
+ * PM_SCIF_RETRY_COUNT times, then start the receive work.
+ * NOTE(review): failure paths may return the initial value 1 or a raw
+ * SCIF error rather than a negative errno -- callers should treat any
+ * non-zero return as failure.
+ */
+int pm_scif_init(void)
+{
+ int err = 1;
+ int retry = 0;
+
+ FUNCTION_ENTRY;
+ PM_DB("pm_scif insmoded \n");
+#ifdef PM_SCIF_IOCTL
+ if ((err = spm_dev_init())) {
+ PM_DB(" spm_dev_init failed\n");
+ goto done;
+ }
+#endif
+ atomic_set(&epinuse,0);
+ pm_scif = kzalloc(sizeof(mic_pm_scif), GFP_KERNEL);
+
+ if (!pm_scif) {
+ err = -ENOMEM;
+ goto end_con;
+ }
+
+ pm_scif_register(&micpmscif);
+
+ if ((pm_scif->ep = scif_open()) == NULL) {
+ PM_DB(" scif_open failed\n");
+ goto end_con;
+ }
+
+ if ((pm_scif->lport = scif_bind(pm_scif->ep, 0)) < 0) {
+ PM_DB(" scif_bind failed\n");
+ goto end_con;
+ }
+
+ PM_DB(" scif_bind successfull. Local port number = %d, ep = \n",
+ pm_scif->lport);
+ dump_ep(pm_scif->ep, __func__,__LINE__);
+ pm_scif->rport_id.node = 0;
+ pm_scif->rport_id.port = SCIF_PM_PORT_0;
+
+ /* retry once a second until the host side is listening */
+ while ((err = scif_connect(pm_scif->ep, &pm_scif->rport_id)) != 0) {
+ PM_DB(" scif_connect failed with err = %d ep %p\n",err,
+ pm_scif->ep);
+ msleep(1000);
+ if (retry++ > PM_SCIF_RETRY_COUNT)
+ goto end_con;
+ }
+
+ /* connected: start the single-threaded receive loop */
+ pm_scif->pm_recvq = create_singlethread_workqueue("pm_recvq");
+ INIT_WORK(&pm_scif->pm_recv, pm_recv_from_host);
+ queue_work(pm_scif->pm_recvq, &pm_scif->pm_recv);
+ pm_scif->con_state = PM_CONNECTED;
+ err = 0;
+#ifdef PM_SCIF_IOCTL
+done:
+#endif
+ return err;
+end_con:
+ /* unwind whatever was set up; pm_scif_exit handles partial state */
+ pm_scif_exit();
+ FUNCTION_EXIT;
+ return err;
+}
+EXPORT_SYMBOL(pm_scif_init);
+
+/*
+ * Tear down the PM SCIF link: tell the host we're closing, block further
+ * senders, drain and destroy the receive workqueue, close the endpoint
+ * and free the state.  Also used to unwind a failed pm_scif_init(), so
+ * every step must tolerate partially-initialized state.  Fixes vs.
+ * original: the CLOSE message is only sent when pm_scif was actually
+ * allocated (pm_send_to_host() dereferences it), and the freed pointer
+ * is cleared to guard against double-exit / late callers.
+ */
+void pm_scif_exit(void)
+{
+	unsigned long flags;
+
+	FUNCTION_ENTRY;
+	PM_DB("Good Bye!, pm scif \n");
+
+	if (pm_scif)
+		pm_send_to_host(PM_MESSAGE_CLOSE, NULL, 0);
+	write_lock_irqsave(&pmscif_send, flags);
+	atomic_set(&epinuse, 1);
+	write_unlock_irqrestore(&pmscif_send, flags);
+
+	if (pm_scif) {
+		if (pm_scif->pm_recvq) {
+			flush_workqueue(pm_scif->pm_recvq);
+			PM_DB("calling destroy\n");
+			destroy_workqueue(pm_scif->pm_recvq);
+		}
+
+		PM_DB("closing ep \n");
+		if (pm_scif->ep)
+			scif_close(pm_scif->ep);
+
+		pm_scif_unregister(&micpmscif);
+		pm_scif->con_state = PM_DISCONNECTED;
+		kfree(pm_scif);
+		pm_scif = NULL;
+	}
+	#ifdef PM_SCIF_IOCTL
+	spm_dev_deinit();
+	#endif
+	FUNCTION_EXIT;
+}
+
+EXPORT_SYMBOL(pm_scif_exit);
+
+module_init(pm_scif_init);
+module_exit(pm_scif_exit);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#if !defined(__PM_SCIF_H)
+#define __PM_SCIF_H
+
+/* Callback bundle exchanged with the core PM layer: pm_scif_uos2host
+ * sends a message to the host (provided by the pm_scif module);
+ * pm_scif_host2uos delivers a host-originated message to the registered
+ * client (provided by that client, may be NULL). */
+struct mic_pmscif_handle{
+ int (*pm_scif_uos2host)(PM_MESSAGE opcode, void *msg, size_t len);
+ int (*pm_scif_host2uos)(pm_msg_header *header, void *msg);
+ struct module *owner;
+};
+
+/* Registration hooks implemented by the core PM/SCIF layer. */
+extern int pm_scif_register(struct mic_pmscif_handle *pmscif);
+extern void pm_scif_unregister(struct mic_pmscif_handle *pmscif);
+#endif //__PM_SCIF_H
--- /dev/null
+obj-m := ramoops.o
--- /dev/null
+/*
+ * RAM Oops/Panic logger
+ *
+ * Copyright (C) 2009 Marco Stornelli <marco.stornelli@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kmsg_dump.h>
+#include <linux/time.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+
+#define RAMOOPS_KERNMSG_HDR "===="
+#define RAMOOPS_HEADER_SIZE (5 + sizeof(struct timeval))
+
+#define RECORD_SIZE 4096
+
+static ulong mem_address;
+module_param(mem_address, ulong, 0600);
+MODULE_PARM_DESC(mem_address,
+ "start of reserved RAM used to store oops/panic logs");
+
+static ulong mem_size;
+module_param(mem_size, ulong, 0600);
+MODULE_PARM_DESC(mem_size,
+ "size of reserved RAM used to store oops/panic logs");
+
+static int dump_oops = 1;
+module_param(dump_oops, int, 0600);
+MODULE_PARM_DESC(dump_oops,
+ "set to 1 to dump oopses, 0 to only dump panics (default 1)");
+
+/* Driver state: one reserved RAM area carved into RECORD_SIZE slots,
+ * written round-robin as oops records arrive. */
+static struct ramoops_context {
+ struct kmsg_dumper dump; /* kmsg_dump registration handle */
+ void *virt_addr; /* ioremap()ed view of the reserved RAM */
+ phys_addr_t phys_addr; /* physical base (mem_address parameter) */
+ unsigned long size; /* total reserved size (mem_size parameter) */
+ int count; /* next record slot to write */
+ int max_count; /* number of RECORD_SIZE slots available */
+} oops_cxt;
+
+/*
+ * kmsg_dump callback: write one record (header + timestamp + as much of
+ * the two kmsg segments as fits in RECORD_SIZE) into the next slot of
+ * the reserved RAM area, keeping the TAIL of each segment when it must
+ * be truncated.  Fixes vs. original: the `&timestamp` argument had been
+ * corrupted into an HTML entity ("×tamp"), and the timestamp format
+ * now uses %ld to match the (long) casts.
+ */
+static void ramoops_do_dump(struct kmsg_dumper *dumper,
+		enum kmsg_dump_reason reason, const char *s1, unsigned long l1,
+		const char *s2, unsigned long l2)
+{
+	struct ramoops_context *cxt = container_of(dumper,
+			struct ramoops_context, dump);
+	unsigned long s1_start, s2_start;
+	unsigned long l1_cpy, l2_cpy;
+	int res;
+	char *buf;
+	struct timeval timestamp;
+
+	/* Only dump oopses if dump_oops is set */
+	if ((reason != KMSG_DUMP_OOPS) || !dump_oops)
+		return;
+
+	buf = (char *)(cxt->virt_addr + (cxt->count * RECORD_SIZE));
+	memset(buf, '\0', RECORD_SIZE);
+	res = sprintf(buf, "%s", RAMOOPS_KERNMSG_HDR);
+	buf += res;
+	do_gettimeofday(&timestamp);
+	res = sprintf(buf, "%ld.%ld\n", (long)timestamp.tv_sec, (long)timestamp.tv_usec);
+	buf += res;
+
+	/* budget the remaining record space, s2 (newest) first */
+	l2_cpy = min(l2, (unsigned long)(RECORD_SIZE - RAMOOPS_HEADER_SIZE));
+	l1_cpy = min(l1, (unsigned long)(RECORD_SIZE - RAMOOPS_HEADER_SIZE) - l2_cpy);
+
+	/* keep the tail of each segment when truncating */
+	s2_start = l2 - l2_cpy;
+	s1_start = l1 - l1_cpy;
+
+	memcpy(buf, s1 + s1_start, l1_cpy);
+	memcpy(buf + l1_cpy, s2 + s2_start, l2_cpy);
+
+	/* advance the circular slot index */
+	cxt->count = (cxt->count + 1) % cxt->max_count;
+}
+
+/*
+ * Module init: validate and round the mem_size/mem_address parameters,
+ * reserve and map the RAM area, and register the kmsg dumper.  Returns
+ * 0 or a negative errno.  Fix vs. original: rounddown_pow_of_two()
+ * RETURNS the rounded value -- the original discarded it, leaving
+ * mem_size unrounded.
+ */
+static int __init ramoops_init(void)
+{
+	struct ramoops_context *cxt = &oops_cxt;
+	int err = -EINVAL;
+
+	if (!mem_size) {
+		printk(KERN_ERR "Invalid size specification");
+		goto fail3;
+	}
+
+	/* work with a power-of-two area so it divides evenly into records */
+	mem_size = rounddown_pow_of_two(mem_size);
+
+	if (mem_size < RECORD_SIZE) {
+		printk(KERN_ERR "size too small");
+		goto fail3;
+	}
+
+	cxt->max_count = mem_size / RECORD_SIZE;
+	cxt->count = 0;
+	cxt->size = mem_size;
+	cxt->phys_addr = mem_address;
+
+	if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) {
+		printk(KERN_ERR "ramoops: request mem region failed");
+		err = -EINVAL;
+		goto fail3;
+	}
+
+	cxt->virt_addr = ioremap(cxt->phys_addr, cxt->size);
+	if (!cxt->virt_addr) {
+		printk(KERN_ERR "ramoops: ioremap failed");
+		err = -ENOMEM;
+		goto fail2;
+	}
+
+	cxt->dump.dump = ramoops_do_dump;
+	err = kmsg_dump_register(&cxt->dump);
+	if (err) {
+		printk(KERN_ERR "ramoops: registering kmsg dumper failed");
+		goto fail1;
+	}
+
+	return 0;
+
+fail1:
+	iounmap(cxt->virt_addr);
+fail2:
+	release_mem_region(cxt->phys_addr, cxt->size);
+fail3:
+	return err;
+}
+
+/* Module unload: unregister the kmsg dumper, then unmap and release the
+ * reserved RAM region. */
+static void __exit ramoops_exit(void)
+{
+ struct ramoops_context *cxt = &oops_cxt;
+
+ if (kmsg_dump_unregister(&cxt->dump) < 0)
+ printk(KERN_WARNING "ramoops: could not unregister kmsg_dumper\n");
+
+ iounmap(cxt->virt_addr);
+ release_mem_region(cxt->phys_addr, cxt->size);
+}
+
+
+module_init(ramoops_init);
+module_exit(ramoops_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marco Stornelli <marco.stornelli@gmail.com>");
+MODULE_DESCRIPTION("RAM Oops/Panic logger/driver");
+
--- /dev/null
+# Kbuild for the micras RAS module: common objects plus per-target parts.
+obj-m := micras.o
+
+micras-y := micras_main.o micras_common.o
+micras-y += micras_core.o micras_uncore.o micras_elog.o
+# Knights Ferry (L1OM) only
+micras-$(CONFIG_ML1OM) += micras_knf.o
+# Knights Corner (K1OM) only, includes power management
+micras-$(CONFIG_MK1OM) += micras_knc.o micras_pm.o
--- /dev/null
+#
+# Build RAS drivers
+#
+# In Linux 2.6 kernels modules must be built by the kernel's kbuild
+# system, with a path to the kernel module source directory. Kbuild
+# expects a general purpose Makefile to exist and optionally an extra
+# file named Kbuild with the kernel module build details.
+# This Makefile is a 'backwards compatible' (see file "modules.txt").
+#
+DEBUG = n
+
+ifneq ($(KERNELRELEASE),)
+
+#
+# Kbuild backwards compatibility part:
+# Load Kbuild to specify module targets and options.
+#
+include Kbuild
+
+else
+
+#
+# Standard invocation:
+#
+# Export variables to environment and pass control to kernel tools
+# ARCH Target architecture: l1om or k1om
+# KERNELDIR Top of MIC kernel tree (not repo source tree)
+# DRIVERDIR Top of MPSS drivers build tree (not repo source tree)
+#
+
+ARCH := $(or $(ARCH), $(shell cat $(CURDIR)/../.arch 2>/dev/null))
+
+ifeq ($(DRIVERDIR),)
+ifeq ($(shell /usr/bin/test -d ../source-root/$(ARCH)-hybrid && echo Y),Y)
+DRIVERDIR = $(PWD)/../source-root/$(ARCH)-hybrid
+KERNELDIR ?= $(DRIVERDIR)/card/kernel
+else ifeq ($(shell /usr/bin/test -d ../source-root/$(ARCH)-internal && echo Y),Y)
+DRIVERDIR = $(PWD)/../source-root/$(ARCH)-internal
+KERNELDIR ?= $(DRIVERDIR)/card/kernel
+endif
+endif
+KERNELDIR ?= ../../miclinux
+
+SCIF_SYM = $(DRIVERDIR)/card/driver/Module.symvers
+SCIF_LIB = $(DRIVERDIR)/host/scif_lib
+SCIF_HEADER = $(DRIVERDIR)/include
+
+EXTRA_CFLAGS += $(KERNWARNFLAGS)
+ifeq ($(ARCH),l1om)
+ EXTRA_CFLAGS += -DMIC_IS_L1OM
+else ifeq ($(ARCH),k1om)
+ EXTRA_CFLAGS += -DMIC_IS_K1OM
+else
+ $(error $$(ARCH) must be l1om or k1om)
+endif
+EXTRA_CFLAGS += -DINTERNAL_REG=1 -Wall
+EXTRA_CFLAGS += $(SPOOKY_MIC_CFLAGS)
+
+CROSS_COMPILE = x86_64-$(ARCH)-linux-
+
+ifeq ($(shell which $(CROSS_COMPILE)gcc 2>/dev/null),)
+ ifeq ($(shell which ../cross/bin/$(CROSS_COMPILE)gcc 2>/dev/null),)
+ $(error $$(PATH) must include $(CROSS_COMPILE)gcc)
+ else
+ CROSS_COMPILE = $(PWD)/../cross/bin/x86_64-$(ARCH)-linux-
+ endif
+endif
+
+default: modules tests
+
+modules:
+ @ echo "$(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) modules"
+ @ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) \
+ V=0 DEBUG=$(DEBUG) \
+ SPOOKY_MIC_CFLAGS=$(SPOOKY_MIC_CFLAGS) \
+ CROSS_COMPILE=$(CROSS_COMPILE) \
+ KBUILD_EXTRA_SYMBOLS=$(SCIF_SYM) \
+ modules
+
+install: modules_install
+
+modules_install:
+ @ echo "$(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) install"
+ @ $(MAKE) -C $(KERNELDIR) M=$(CURDIR) ARCH=$(ARCH) \
+ V=0 DEBUG=$(DEBUG) \
+ CROSS_COMPILE=$(CROSS_COMPILE) \
+ SPOOKY_MIC_CFLAGS=$(SPOOKY_MIC_CFLAGS) \
+ KBUILD_EXTRA_SYMBOLS=$(SCIF_SYM) \
+ INSTALL_MOD_PATH=$(DESTDIR) \
+ modules_install
+
+#
+# Test programs, expects that compilers and SCIF libraries are present.
+#
+host-tools = edecode gdecode
+host-tests = cp mc ttl tmp cutl proc ukill fan smc fsc pm trbo ptrig cp32 p-in-host p-out-host
+card-tests = p-in-card p-out-card suid load
+
+tests: $(host-tools) $(host-tests) $(card-tests)
+
+cp: cp.c micras_api.h
+ @ echo gcc -O2 cp.c -o cp -lscif
+ @ gcc -O2 cp.c -o cp $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+cp32: cp32.c micras_api.h
+ @ echo gcc -O2 cp32.c -o cp32 -lscif
+ @ gcc -O2 cp32.c -o cp32 $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+mc: mc.c micmca_api.h
+ @ echo gcc -O2 mc.c -o mc -lscif
+ @ gcc -O2 mc.c -o mc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+cutl: cutl.c micras_api.h
+ @ echo gcc -O2 cutl.c -o cutl -lscif -lncurses
+ @ gcc -O2 cutl.c -o cutl $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif -lncurses
+
+cutl2: cutl2.c micras_api.h
+ @ echo gcc -O2 cutl2.c -o cutl2 -lscif -lncurses
+ @ gcc -O2 cutl2.c -o cutl2 $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif -lncurses
+
+proc: proc.c micras_api.h
+ @ echo gcc -O2 proc.c -o proc -lscif -lncurses
+ @ gcc -O2 proc.c -o proc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif -lncurses
+
+trbo: trbo.c micras_api.h
+ @ echo gcc -O2 trbo.c -o trbo -lscif
+ @ gcc -O2 trbo.c -o trbo $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+fan: fan.c micras_api.h
+ @ echo gcc -O2 fan.c -o fan -lscif
+ @ gcc -O2 fan.c -o fan $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+smc: smc.c micras_api.h
+ @ echo gcc -O2 smc.c -o smc -lscif
+ @ gcc -O2 smc.c -o smc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+pm_tst: pm_tst.c micras_api.h
+ @ echo gcc -O2 pm_tst.c -o pm_tst -lscif
+ @ gcc -O2 pm_tst.c -o pm_tst $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+fsc: fsc.c micras_api.h
+ @ echo gcc -O2 fsc.c -o fsc -lscif
+ @ gcc -O2 fsc.c -o fsc $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+ptrig: ptrig.c micras_api.h
+ @ echo gcc -O2 ptrig.c -o ptrig -lscif
+ @ gcc -O2 ptrig.c -o ptrig $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+pm: pm.c micras_api.h micpm_api.h
+ @ echo gcc -O2 pm.c -o pm -lscif
+ @ gcc -O2 pm.c -o pm $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+tmp: tmp.c micras_api.h
+ @ echo gcc -O2 tmp.c -o tmp -lscif
+ @ gcc -O2 tmp.c -o tmp $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+ttl: ttl.c micras_api.h micpm_api.h
+ @ echo gcc -O2 ttl.c -o ttl -lscif
+ @ gcc -O2 ttl.c -o ttl $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+ukill: ukill.c micras_api.h
+ @ echo gcc -O2 ukill.c -o ukill -lscif
+ @ gcc -O2 ukill.c -o ukill $(EXTRA_CFLAGS) -I$(SCIF_HEADER) -L$(SCIF_LIB) -lscif
+
+edecode: edecode.c
+ @ echo gcc -O2 edecode.c -o edecode
+ @ gcc -O2 -Wall edecode.c -o edecode
+
+gdecode: gdecode.c
+ @ echo gcc -O2 gdecode.c -o gdecode
+ @ gcc -O2 -Wall gdecode.c -o gdecode
+
+p-in-host: p-in.c Makefile
+ @ echo gcc -O2 p-in.c -o p-in-host
+ @ gcc -O2 p-in.c -o p-in-host -DIOK=16
+
+p-out-host: p-out.c Makefile
+ @ echo gcc -O2 p-out.c -o p-out-host
+ @ gcc -O2 p-out.c -o p-out-host -DIOK=16 -DTXG=64
+
+suid: suid.c
+ @ echo cross-gcc -O2 suid.c -o suid
+ @ $(CROSS_COMPILE)gcc -O2 suid.c -o suid
+
+p-in-card: p-in.c Makefile
+ @ echo cross-gcc -O2 p-in.c -o p-in-card
+ @ $(CROSS_COMPILE)gcc -O2 p-in.c -o p-in-card -DIOK=64
+
+p-out-card: p-out.c Makefile
+ @ echo cross-gcc -O2 p-out.c -o p-out-card
+ @ $(CROSS_COMPILE)gcc -O2 p-out.c -o p-out-card -DIOK=64 -DTXG=16
+
+load: load.c
+ @ echo cross-gcc load.c -o load -pthread -lpthread
+ @ $(CROSS_COMPILE)gcc load.c -o load $(EXTRA_CFLAGS) -pthread -lpthread
+
+cpptest:
+ @ echo Dumping compiler defines
+ @ echo > nil.c
+ @ $(CROSS_COMPILE)gcc -E -dM nil.c | sort
+ @ rm nil.c
+
+endif
+
+# Remove kbuild artifacts and all locally built test binaries.
+# Note: 'cutl2' and 'pm_tst' have build rules but appear in none of the
+# host-tools/host-tests/card-tests lists, so they are named explicitly
+# here to make sure a built copy is cleaned up too.
+clean:
+ @ echo " Cleaning .."
+ @ rm -fr *.o *~ core .*.sw? .depend .*.cmd *.ko *.mod.c \
+ .tmp_versions modules.order Module.symvers
+ @ rm -f $(host-tools) $(host-tests) $(card-tests) cutl2 pm_tst
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * Definition of the public MC interface.
+ * Access to MC event features provided through SCIF only.
+ */
+
+#ifndef _MICMCA_API_H_
+#define _MICMCA_API_H_ 1
+
+#ifdef __cplusplus
+extern "C" { /* C++ guard */
+#endif
+
+/*
+ * Configuration manifests
+ */
+
+#pragma pack(push, 4) /* Windows requirement */
+
+
+/*
+ * Machine check info is reported on this port. Only one consumer can
+ * (and must) connect in order to be notified about MC events.
+ */
+
+#define MR_MCE_PORT SCIF_RAS_PORT_1
+
+
+/*
+ * MC events are provided in raw form, i.e. as close to the
+ * contents of MCA register banks as possible. It is not
+ * the responsibility of the MCA event handler to perform
+ * analysis and interpretation of these registers, beyond
+ * determining whether the event was deadly to the uOS.
+ *
+ * Any data or context corruption _IS_ deadly by definition!
+ *
+ * Source identifiers:
+ * org id
+ * 0 Bank 0 CPU #, core event, range 0..CPU_MAX
+ * 1 Bank 1 CPU #, core event, range 0..CPU_MAX
+ * 2 Bank 2 CPU #, core event, range 0..CPU_MAX
+ * 3 DBOX #, uncore event, range 0..DBOX_MAX
+ * 4 SBOX, uncore event, range 0
+ * 5 GBOX #, uncore event, range 0..GBOX_MAX
+ * 6 TBOX #, uncore event, range 0..TBOX_MAX
+ *
+ * Report flags bits (when set) representing:
+ * [31:5] Unused (and reserved)
+ * [4] Filter event, uOS side disabled this event
+ * [3] Status event, no failure (just MCA bank dump)
+ * [2] Injected or artificially generated event
+ * [1] This event has been recorded in EEPROM
+ * [0] Fatal, the uOS is toast (card needs reset)
+ *
+ * MCA bank register sizes are not the same on all banks:
+ *
+ * CTL STATUS ADDR MISC Notes
+ * CPU 0: 32 64 - - A,M not implemented, always 0
+ * CPU 1: 32 64 64 32
+ * CPU 2: 32 64 64 - M not implemented, always 0
+ * DBOX: 32 64 64 - M not implemented, always 0
+ * SBOX: 32 64 64 64
+ * GBOX: 64 64 64 32
+ * TBOX: 64 64 32 - M not implemented, not there
+ */
+
+/* Event source identifiers for MceInfo.org (see source table above) */
+#define MC_ORG_BNK0 0
+#define MC_ORG_BNK1 1
+#define MC_ORG_BNK2 2
+#define MC_ORG_DBOX 3
+#define MC_ORG_SBOX 4
+#define MC_ORG_GBOX 5
+#define MC_ORG_TBOX 6
+
+/* Report flag bits for MceInfo.flags (see bit list above) */
+#define MC_FLG_FATAL (1 << 0)
+#define MC_FLG_LOG (1 << 1)
+#define MC_FLG_FALSE (1 << 2)
+#define MC_FLG_STATUS (1 << 3)
+#define MC_FLG_FILTER (1 << 4)
+
+/*
+ * One machine-check event record, delivered to the single consumer
+ * connected on MR_MCE_PORT. Register fields are raw MCA bank values;
+ * banks without a given register report 0 (see size table above).
+ */
+typedef struct mce_info {
+ uint16_t org; /* Source of event (MC_ORG_*) */
+ uint16_t id; /* Identifier of source (CPU/box number) */
+ uint16_t flags; /* Report flags (MC_FLG_*) */
+ uint16_t pid; /* Alternate source ID */
+ uint64_t stamp; /* Time stamp of event */
+ uint64_t ctl; /* MCA bank register 'CTL' */
+ uint64_t status; /* MCA bank register 'STATUS' */
+ uint64_t addr; /* MCA bank register 'ADDR' */
+ uint64_t misc; /* MCA bank register 'MISC' */
+} MceInfo;
+
+
+#pragma pack(pop) /* Restore to entry conditions */
+
+#ifdef __cplusplus
+} /* C++ guard */
+#endif
+
+#endif /* Recursion block */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * Definition of the PM interface to the RAS module.
+ *
+ * Throttle event interface is similar to the MC interface.
+ * If a connection is made to MR_TTL_PORT then event records
+ * will be sent to the host. Events are sent non-blocking,
+ * so if the SCIF buffer runs full, events are dropped until
+ * the block disappears (or the session is closed).
+ *
+ * Queries are technically implemented as an extension to the
+ * MT interface, and thus are accessible from the host.
+ * Except for the risk of conflicting commands written to the
+ * two power limit registers, there are no side effects from
+ * host side access via SCIF.
+ *
+ * Currently there are no plans to expose this in SysFs nodes.
+ * These routines are just wrappers for read/write access to
+ * SMC registers. No precious IP here.
+ */
+
+#ifndef _MICPM_API_H_
+#define _MICPM_API_H_ 1
+
+#ifdef __cplusplus
+extern "C" { /* C++ guard */
+#endif
+
+
+/*
+**
+** Configuration manifests
+**
+*/
+
+#pragma pack(push, 4) /* Weird Windos requirement */
+
+
+/*
+ * Throttle events are reported on this port. Only one consumer can
+ * connect in order to be notified about PM throttling events.
+ */
+
+#define MR_TTL_PORT SCIF_RAS_PORT_2
+
+
+/*
+ * Throttle events are provided in raw form, i.e. with as
+ * little processing on the card side as possible.
+ * For nicer throttle state display, use MT command MR_REQ_TTL.
+ *
+ * To compensate for the chance of lost events, the full
+ * throttle state is transferred in one byte on every message:
+ *
+ * Bit# Content
+ * 0 Power throttle state changed
+ * 1 New/Current power throttle state
+ * 2 Thermal throttle state changed
+ * 3 New/Current thermal throttle state
+ * 4 Power alert state changed
+ * 5 New/Current power alert state
+ *
+ * By definition, when power and thermal throttle are in effect
+ * the KnC is forced to run at reduced speed (600 MHz or so) and
+ * with lower operating voltages, i.e. software is not in control.
+ * During power alerts the KnC is consuming more power than PLim1
+ * and the PM module can reduce speed and/or voltages to reduce
+ * power consumption. If power consumption goes beyond PLim0, the
+ * hardware (SMC really) will start real power throttles.
+ * In effect time spent in power throttle, will also be counted
+ * as being in the power alert state. See MT request MR_REQ_TTL.
+ */
+
+/* Throttle-state bits carried in TtlInfo.upd (see bit list above) */
+#define PM_PWR_TTL_CHG (1 << 0) /* Power throttle change */
+#define PM_PWR_TTL (1 << 1) /* Power throttle state */
+#define PM_TRM_TTL_CHG (1 << 2) /* Thermal throttle change */
+#define PM_TRM_TTL (1 << 3) /* Thermal throttle state */
+#define PM_ALRT_TTL_CHG (1 << 4) /* Power alert change */
+#define PM_ALRT_TTL (1 << 5) /* Power alert state */
+
+/*
+ * One throttle event record, sent to the consumer on MR_TTL_PORT.
+ * Carries the full throttle state so lost events can be tolerated.
+ */
+typedef struct ttl_info {
+ uint8_t upd; /* Throttle state update (PM_*_TTL* bits) */
+ uint8_t die; /* Die temperature (as per SBOX) */
+} TtlInfo;
+
+
+
+/*
+ * PM specific MT opcodes
+ * Leave one empty slot in callout table between
+ * this and the official MT API entries.
+ */
+
+/*
+ * PM opcode values continue after the MT API's MR_REQ_MAX; the slot
+ * at MR_REQ_MAX + 1 is deliberately left empty (see note above).
+ */
+#define PM_REQ_PL0 (MR_REQ_MAX + 2) /* Get power limit 0 */
+#define PM_SET_PL0 (MR_REQ_MAX + 3) /* Set power limit 0 */
+#define PM_REQ_PL1 (MR_REQ_MAX + 4) /* Get power limit 1 */
+#define PM_SET_PL1 (MR_REQ_MAX + 5) /* Set power limit 1 */
+#define PM_REQ_PAVG (MR_REQ_MAX + 6) /* Get average power */
+#define PM_REQ_PTTL (MR_REQ_MAX + 7) /* Get power throttle */
+#define PM_REQ_VOLT (MR_REQ_MAX + 8) /* Get voltage */
+#define PM_REQ_TEMP (MR_REQ_MAX + 9) /* Get temperatures */
+#define PM_REQ_TACH (MR_REQ_MAX + 10) /* Get fan tachometer */
+#define PM_REQ_TTTL (MR_REQ_MAX + 11) /* Get thermal throttle */
+#define PM_REQ_FTTL (MR_REQ_MAX + 12) /* Get force throttle */
+#define PM_SET_FTTL (MR_REQ_MAX + 13) /* Set force throttle */
+#define PM_REQ_MAX PM_SET_FTTL /* Last PM command */
+
+
+/*
+**
+** Response container structures below.
+**
+*/
+
+
+/*
+ * Get power limit
+ * REQ_PL{0/1} notes:
+ * - Only power limit 0 has a guard band defined.
+ */
+/* Response for PM_REQ_PL0 / PM_REQ_PL1 */
+typedef struct pm_rsp_plim {
+ uint32_t pwr_lim; /* Power limit, in Watt */
+ uint32_t time_win; /* Time Window, in mSec */
+ uint32_t guard_band; /* Guard band, in Watt (defined for limit 0 only) */
+} PmRspPlim;
+
+
+/*
+ * Set power limit
+ */
+/* Command payload for PM_SET_PL0 / PM_SET_PL1 */
+typedef struct pm_cmd_plim {
+ uint32_t pwr_lim; /* Power limit, in Watt */
+ uint32_t time_win; /* Time Window, in mSec */
+} PmCmdPlim;
+
+
+/*
+ * Get average power
+ * REQ_PAVG notes:
+ * - Both values are subject to availability in the SMC.
+ * The top two status bit of each SMC register is provided
+ * separately (and stripped from the read value). Decode as
+ * 00 Data OK
+ * 01 Lower threshold reached
+ * 10 Upper threshold reached
+ * 11 Data unavailable
+ * It is unclear if data is good if outside thresholds.
+ */
+/* Response for PM_REQ_PAVG */
+typedef struct pm_rsp_pavg {
+ uint8_t stat_0; /* Status bits for window 0 (decode table above) */
+ uint8_t stat_1; /* Status bits for window 1 (decode table above) */
+ uint32_t pwr_0; /* Average over window 0, in Watt */
+ uint32_t pwr_1; /* Average over window 1, in Watt */
+} PmRspPavg;
+
+
+/*
+ * Get Power throttle status
+ * REQ_PTTL notes:
+ * - Duration value is subject to availability in the SMC.
+ * The top two status bit of this SMC register is provided
+ * separately (and stripped from the read value). Decode as
+ * 00 Data OK
+ * 01 Reserved
+ * 10 Reserved
+ * 11 Data unavailable
+ */
+/* Response for PM_REQ_PTTL */
+typedef struct pm_rsp_pttl {
+ uint8_t pwr_ttl; /* Power throttle asserted */
+ uint8_t stat_dur; /* Status bits for 'duration' (decode table above) */
+ uint32_t duration; /* Power throttle duration, in mSec */
+} PmRspPttl;
+
+
+/*
+ * Get voltages
+ * REQ_VOLT notes:
+ * - VR values are subject to availability in the SMC.
+ * The top two status bit of each SMC register is provided
+ * separately (and stripped from the read value). Decode as
+ * 00 Data OK
+ * 01 Lower threshold reached
+ * 10 Upper threshold reached
+ * 11 Data unavailable
+ * It is unclear if data is good if outside thresholds.
+ */
+/* Response for PM_REQ_VOLT */
+typedef struct pm_rsp_volt {
+ uint8_t stat_vccp; /* Status bits for Vccp (orig. comment said 'Vddc' — confirm naming) */
+ uint8_t stat_vddg; /* Status bits for Vddg */
+ uint8_t stat_vddq; /* Status bits for Vddq */
+ uint32_t vccp; /* Vccp, in mV */
+ uint32_t vddg; /* Vddg, in mV */
+ uint32_t vddq; /* Vddq, in mV */
+} PmRspVolt;
+
+
+/*
+ * Get temperatures
+ * REQ_TEMP notes:
+ * - These values are subject to availability in the SMC.
+ * The top two status bit of each SMC register is provided
+ * separately (and stripped from the read value). Decode as
+ * 00 Data OK
+ * 01 Lower threshold reached
+ * 10 Upper threshold reached
+ * 11 Data unavailable
+ * It is unclear if data is good if outside thresholds.
+ */
+/* Response for PM_REQ_TEMP */
+typedef struct pm_rsp_temp {
+ uint8_t stat_cpu; /* Status bits for Tcpu (decode table above) */
+ uint8_t stat_vccp; /* Status bits for Vccp VR temp (orig. said 'Tvddc' — confirm naming) */
+ uint8_t stat_vddg; /* Status bits for Vddg VR temp */
+ uint8_t stat_vddq; /* Status bits for Vddq VR temp */
+ uint32_t cpu; /* CPU temp, in C */
+ uint32_t vccp; /* Vccp VR temp, in C */
+ uint32_t vddg; /* Vddg VR temp, in C */
+ uint32_t vddq; /* Vddq VR temp, in C */
+} PmRspTemp;
+
+
+/*
+ * Get fan tachometer
+ * REQ_TACH notes:
+ * - These values are subject to availability in the SMC.
+ * The top two status bit of each SMC register is provided
+ * separately (and stripped from the read value). Decode as
+ * 00 Data OK
+ * 01 Lower threshold reached (tach only)
+ * 10 Reserved
+ * 11 Data unavailable
+ * It is unclear if data is good if outside thresholds.
+ */
+/* Response for PM_REQ_TACH */
+typedef struct pm_rsp_tach {
+ uint8_t stat_pwm; /* Status bits for PWM (decode table above) */
+ uint8_t stat_tach; /* Status bits for TACH (decode table above) */
+ uint32_t fan_pwm; /* Fan power, in % */
+ uint32_t fan_tach; /* Fan speed, in RPM */
+} PmRspTach;
+
+
+/*
+ * Get thermal throttle status
+ * REQ_TTTL notes:
+ * - Duration value is subject to availability in the SMC.
+ * The top two status bit of this SMC register is provided
+ * separately (and stripped from the read value). Decode as
+ * 00 Data OK
+ * 01 Reserved
+ * 10 Reserved
+ * 11 Data unavailable
+ */
+/* Response for PM_REQ_TTTL */
+typedef struct pm_rsp_tttl {
+ uint8_t thrm_ttl; /* Thermal throttle asserted (orig. comment said 'Power' — copy/paste slip) */
+ uint8_t stat_dur; /* Status bits for 'duration' (decode table above) */
+ uint32_t duration; /* Thermal throttle duration, in mSec */
+} PmRspTttl;
+
+
+/*
+ * Get/Set force throttle control
+ */
+/* Response for PM_REQ_FTTL / PM_SET_FTTL */
+typedef struct pm_rsp_fttl {
+ uint8_t forced; /* Forced power throttle asserted */
+} PmRspFttl;
+
+
+#pragma pack(pop) /* Restore to sane conditions */
+
+#ifdef __cplusplus
+} /* C++ guard */
+#endif
+
+#endif /* Recursion block */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS module common internal declarations
+ *
+ * Configuration flags, constants and function prototypes
+ * for the RAS sysfs, MT and MC module.
+ */
+
+#ifndef _MICRAS_H_
+#define _MICRAS_H_ 1
+
+
+/*
+ * Public APIs first.
+ * Must be self-contained and independent of local tunables.
+ */
+
+#include "micras_api.h"
+#include "micmca_api.h"
+#include "micpm_api.h"
+
+
+/*
+ * Local configurables & tunables
+ */
+
+#define USE_PM 1 /* Support power management */
+
+#define RAS_HALT 1 /* Panic on uncorrectable MCAs */
+
+#define I2C_SLOW 1 /* Default to lowest speed on I2C */
+
+#define USE_FSC 1 /* Allow using FSC MGBR/MGBSR protocol */
+#define USE_SVID 0 /* Allow using SVID for VR info */
+#define USE_SMC 1 /* Prefer SMC over SBOX (telemetry) */
+
+#define MT_TIMER 1 /* Enable periodic wakeup */
+#define MT_PERIOD 999 /* Period sleep (mS) */
+
+#define MCU_NMI 1 /* Use NMI in SBOX redirection table */
+
+#define EE_VERIFY 0 /* Verify all EEPROM writes */
+#define EE_PROC 1 /* Enable access to EEPROM from /proc/elog */
+#define EE_PROC_NEW 0 /* Only display events between head & tail */
+#define EE_INJECT 0 /* Enable writes to EEPROM via /proc/elog */
+
+#define BEAM_TEST 0 /* Neuter MC handling for beam test */
+
+#define MT_VERBOSE 0 /* Track MT activity in kernel log */
+#define MC_VERBOSE 0 /* Track MC activity in kernel log */
+#define PM_VERBOSE 0 /* Track PM activity in kernel log */
+
+#define GBOX_WORKING 0 /* Set to one when GBOX writes are stable */
+
+#define WA_4845465 0 /* Use HSD #4845465 workaround */
+
+#define ADD_DIE_TEMP 1 /* Embed die temperature in event reports */
+
+#define NOT_YET 0 /* 'Hide' code that's not currently in use */
+
+
+/*
+ * Useful macros
+ *TBD: Cast everything to 64 bit (ULL)?
+ * For now all is 32 bit (U)
+ */
+
+/*
+ * Extract bit field [l:r] of v (l = high bit, r = low bit, inclusive).
+ * NOTE(review): arithmetic is 32-bit (1U, per the TBD note above), so a
+ * full-width field (l - r + 1 == 32) shifts by the type width, which is
+ * undefined behavior in C — keep fields narrower than 32 bits until the
+ * 64-bit (ULL) cast mentioned above is applied.
+ */
+#define GET_BITS(l,r,v) (((v) >> (r)) & ((1U << ((l) - (r) +1)) -1))
+/* Place v into bit field [l:r]; counterpart of GET_BITS, same caveat */
+#define PUT_BITS(l,r,v) (((v) & ((1U << ((l) - (r) +1)) -1)) << (r))
+
+/* Single-bit convenience forms */
+#define GET_BIT(n,v) GET_BITS((n), (n), (v))
+#define PUT_BIT(n,v) PUT_BITS((n), (n), (v))
+
+
+/*
+ * Init/Exit functions
+ */
+
+extern void mr_mt_init(void);
+extern void mr_mt_exit(void);
+extern void mr_mt_card_init(void);
+extern void mr_mt_card_exit(void);
+
+
+/*
+ * Command line options (exported from generic MCE handler)
+ */
+
+extern int mce_disabled;
+
+
+/*
+ * MT opcode/function table.
+ * Resides in micras_main() and gates access though sysctls and SCIF.
+ */
+
+/*
+ * One entry of the MT opcode dispatch table (kept in micras_main(),
+ * per the note above); maps an opcode to its handler function.
+ */
+struct fnc_tab {
+ uint16_t cmd; /* MT opcode this entry serves */
+ uint8_t simple; /* NOTE(review): flag semantics not visible in this header — confirm in micras_main.c */
+ uint8_t privileged; /* Presumably non-zero => privileged callers only — verify against micras_priv use */
+ int (*fnc)(void *); /* Handler; receives opcode-specific buffer */
+};
+
+extern int micras_priv;
+extern int micras_mt_call(uint16_t, void *);
+
+
+/*
+ * MT get functions
+ * Spread over micras_{common,knf,knc}.c
+ */
+extern int mr_get_hwinf(void *);
+extern int mr_get_vers(void *);
+extern int mr_get_pver(void *);
+extern int mr_get_freq(void *);
+extern int mr_get_volt(void *);
+extern int mr_get_power(void *);
+extern int mr_get_plim(void *);
+extern int mr_get_clst(void *);
+extern int mr_get_gddr(void *);
+extern int mr_get_gfreq(void *);
+extern int mr_get_gvolt(void *);
+extern int mr_get_temp(void *);
+extern int mr_get_fan(void *);
+extern int mr_get_ecc(void *);
+extern int mr_get_trc(void *);
+extern int mr_get_trbo(void *);
+extern int mr_get_oclk(void *);
+extern int mr_get_cutl(void *);
+extern int mr_get_mem(void *);
+extern int mr_get_os(void *);
+extern int mr_get_proc(void *);
+extern int mr_get_pmcfg(void *);
+
+/*
+ * MT set functions
+ * Spread over micras_{common,knf,knc}.c
+ */
+extern int mr_set_freq(void *);
+extern int mr_set_volt(void *);
+extern int mr_set_plim(void *);
+extern int mr_set_gfreq(void *);
+extern int mr_set_gvolt(void *);
+extern int mr_set_fan(void *);
+extern int mr_set_trc(void *);
+extern int mr_set_trbo(void *);
+extern int mr_set_oclk(void *);
+
+
+/*
+ * MT cmd functions
+ */
+extern int mr_cmd_pkill(void *);
+extern int mr_cmd_ukill(void *);
+
+
+#if defined(CONFIG_ML1OM) && USE_FSC
+/*
+ * MT FSC access functions
+ * KnF specific, located in micras_knf.c
+ */
+extern int mr_get_fsc(void *);
+extern int mr_set_fsc(void *);
+#endif
+
+#if defined(CONFIG_MK1OM)
+/*
+ * MT SMC access functions
+ * KnC specific, located in micras_knc.c
+ */
+extern int mr_get_smc(void *);
+extern int mr_get_led(void *);
+extern int mr_get_prochot(void *);
+extern int mr_get_pwralt(void *);
+extern int mr_get_perst(void *);
+extern int mr_get_ttl(void *);
+
+extern int mr_set_smc(void *);
+extern int mr_set_led(void *);
+extern int mr_set_prochot(void *);
+extern int mr_set_pwralt(void *);
+extern int mr_set_perst(void *);
+#endif
+
+
+#if defined(CONFIG_MK1OM) && USE_PM
+/*
+ * PM get functions
+ */
+extern int pm_get_pl0(void *);
+extern int pm_get_pl1(void *);
+extern int pm_get_pavg(void *);
+extern int pm_get_pttl(void *);
+extern int pm_get_volt(void *);
+extern int pm_get_temp(void *);
+extern int pm_get_tach(void *);
+extern int pm_get_tttl(void *);
+extern int pm_get_fttl(void *);
+
+/*
+ * PM set functions
+ */
+extern int pm_set_pl0(void *);
+extern int pm_set_pl1(void *);
+extern int pm_set_fttl(void *);
+#endif
+
+
+/*
+ * MC & TTL event distribution functions
+ * Spread over micras_{main,elog,core}.c
+ */
+
+#ifdef MR_MCE_PORT
+extern int micras_mc_send(struct mce_info *, int);
+extern void micras_mc_ipmi(struct mce_info *, int);
+extern void micras_mc_log(struct mce_info *);
+extern uint32_t micras_mc_filter(struct mce_info *, uint64_t, int);
+#endif
+#ifdef MR_TTL_PORT
+extern void micras_ttl_send(struct ttl_info *);
+#endif
+
+
+/*
+ * BOX constants (card variations).
+ */
+
+#ifdef CONFIG_ML1OM
+#define DBOX_NUM 1
+#define GBOX_NUM 4
+#endif
+
+#ifdef CONFIG_MK1OM
+#define DBOX_NUM 2
+#define GBOX_NUM 8 /* Max count, SKU dependent */
+#define TBOX_NUM 8 /* Max count, SKU dependent */
+#endif
+
+#ifndef COMMON_MMIO_BOX_SIZE
+#define COMMON_MMIO_BOX_SIZE (1<<16)
+#endif
+
+
+/*
+ * BOX utility functions
+ * Most located in micras_main.c
+ */
+
+extern char *mr_sku(void);
+extern int mr_mch(void);
+extern int mr_txs(void);
+
+extern uint8_t *micras_sbox;
+extern uint8_t *micras_dbox[DBOX_NUM];
+extern uint8_t *micras_gbox[GBOX_NUM];
+#ifdef CONFIG_MK1OM
+extern uint8_t *micras_tbox[TBOX_NUM];
+#endif
+
+extern uint8_t *mr_sbox_base(int);
+extern uint32_t mr_sbox_rl(int, uint32_t);
+extern void mr_sbox_wl(int, uint32_t, uint32_t);
+extern uint64_t mr_sbox_rq(int, uint32_t);
+extern void mr_sbox_wq(int, uint32_t, uint64_t);
+
+extern uint8_t *mr_dbox_base(int);
+extern uint32_t mr_dbox_rl(int, uint32_t);
+extern void mr_dbox_wl(int, uint32_t, uint32_t);
+extern uint64_t mr_dbox_rq(int, uint32_t);
+extern void mr_dbox_wq(int, uint32_t, uint64_t);
+
+extern uint8_t *mr_gbox_base(int);
+extern uint32_t mr_gbox_rl(int, uint32_t);
+extern void mr_gbox_wl(int, uint32_t, uint32_t);
+extern uint64_t mr_gbox_rq(int, uint32_t);
+extern void mr_gbox_wq(int, uint32_t, uint64_t);
+
+#ifdef CONFIG_MK1OM
+extern uint8_t *mr_tbox_base(int);
+extern uint32_t mr_tbox_rl(int, uint32_t);
+extern void mr_tbox_wl(int, uint32_t, uint32_t);
+extern uint64_t mr_tbox_rq(int, uint32_t);
+extern void mr_tbox_wq(int, uint32_t, uint64_t);
+#endif
+
+
+/*
+ * Un-core MCA register offsets
+ * Some #defines stolen from FreeBSD uOS.
+ *
+ *TBD: check again when we get real register include files
+ */
+
+#define SBOX_MCX_CTL_LO 0x00003090
+#define SBOX_MCX_STATUS_LO 0x00003098
+#define SBOX_MCX_STATUS_HI 0x0000309C
+#define SBOX_MCX_ADDR_LO 0x000030A0
+#define SBOX_MCX_ADDR_HI 0x000030A4
+#define SBOX_MCX_MISC 0x000030A8
+#define SBOX_MCX_MISC2 0x000030AC
+#define SBOX_MCA_INT_STAT 0x0000AB00
+#define SBOX_MCA_INT_EN 0x0000AB04
+#define SBOX_COMPONENT_ID 0x00004134
+
+#define DBOX_MC2_CTL 0x00000340
+#define DBOX_MC2_STATUS 0x00000348
+#define DBOX_MC2_ADDR 0x00000350
+
+#define GBOX_FBOX_MCA_CTL_LO 0x0000005C
+#define GBOX_FBOX_MCA_CTL_HI 0x00000060
+#define GBOX_FBOX_MCA_STATUS_LO 0x00000064
+#define GBOX_FBOX_MCA_STATUS_HI 0x00000068
+#define GBOX_FBOX_MCA_ADDR_LO 0x0000006C
+#define GBOX_FBOX_MCA_ADDR_HI 0x00000070
+#define GBOX_FBOX_MCA_MISC 0x00000074
+
+#ifdef CONFIG_MK1OM
+#define TXS_MCX_CONTROL 0x00003700
+#define TXS_MCX_STATUS 0x00003740
+#define TXS_MCX_ADDRESS 0x00003780
+#endif
+
+
+/*
+ * Thermal register offsets
+ */
+
+#if defined(CONFIG_MK1OM) && WA_4845465
+#ifndef SBOX_MICROCONTROLLER_FAN_STATUS
+#define SBOX_MICROCONTROLLER_FAN_STATUS 0x1020
+#endif
+#endif
+#if defined(CONFIG_MK1OM) && (WA_4845465 || ADD_DIE_TEMP || USE_PM)
+#ifndef SBOX_THERMAL_STATUS_2
+#define SBOX_THERMAL_STATUS_2 0x1080
+#endif
+#endif
+
+
+/*
+ * SMP utilities
+ * Located in micras_main.c
+ */
+
+extern uint32_t rd_cr4_on_cpu(int);
+extern void set_in_cr4_on_cpu(int, uint32_t);
+extern void clear_in_cr4_on_cpu(int, uint32_t);
+extern uint64_t rdtsc(void);
+
+
+/*
+ * General EEPROM and POST card UART access
+ * Located in micras_elog.c
+ */
+
+#define EE_BUF_COUNT 100
+#define EE_BUF_LINELEN 256
+extern char ee_buf[];
+extern atomic_t ee_msg;
+extern atomic_t ee_seen;
+
+extern char * ee_fmt(char *, va_list);
+extern int ee_printk(char *, ...);
+extern int ee_print(char *, ...);
+#ifdef CONFIG_MK1OM
+extern void ee_list(void);
+extern void ee_wipe(void);
+#endif
+extern int ee_init(void);
+extern int ee_exit(void);
+
+extern void myDELAY(uint64_t);
+
+
+/*
+ * SMC access API
+ * Provided by the kernel
+ */
+
+extern int gmbus_i2c_read(uint8_t, uint8_t, uint8_t, uint8_t *, uint16_t);
+extern int gmbus_i2c_write(uint8_t, uint8_t, uint8_t, uint8_t *, uint16_t);
+
+
+/*
+ * RAS core MCA handling
+ * Located in micras_core.c
+ */
+
+extern uint8_t xlat_cpu[NR_CPUS];
+extern void mcc_sync(void);
+extern int mcc_init(void);
+extern int mcc_exit(void);
+extern void mcc_flt_parm(uint8_t *);
+
+
+/*
+ * RAS un-core MCA handling
+ * Located in micras_uncore.c
+ */
+
+extern void box_reset(int);
+extern int mcu_init(void);
+extern int mcu_exit(void);
+
+
+#if defined(CONFIG_MK1OM) && USE_PM
+/*
+ * RAS PM handling
+ * Located in micras_pm.c
+ *
+ * Power management registration exchange records:
+ * The RAS module populates a 'params' record and pass it to
+ * the PM module through the micpm_ras_register() function.
+ * In return the PM module populate the passed 'callbacks' record.
+ * The PM module is responsible for populating the lists of
+ * supported core frequencies and core voltages. In contrast to
+ * KnF, where the lists reflect the hardware capabilities, these
+ * reflect the actual frequencies and voltages that core-freq
+ * module can use to lower power consumption.
+ */
+
+/*
+ * Registration record the RAS module fills in and passes to the PM
+ * module via micpm_ras_register() (see block comment above).
+ */
+struct micpm_params {
+ uint32_t * freq_lst; /* Core frequency list */
+ uint32_t * freq_len; /* Core freq count */
+ uint32_t freq_siz; /* Space in core freq list */
+ uint32_t * volt_lst; /* Core voltage list */
+ uint32_t * volt_len; /* Core voltage count */
+ uint32_t volt_siz; /* Space in core volt list */
+ int (* mt_call)(uint16_t, void *); /* Access MT function */
+ void (* mt_ttl)(int, int); /* Throttle notifier (takes TTL_* args) */
+};
+
+/*
+ * Callback record the PM module populates during registration; each
+ * member must be either a valid function pointer or NULL (see the
+ * register/unregister note below).
+ */
+struct micpm_callbacks {
+ int (*micpm_get_turbo)(void); /* Get PM turbo setting (MR_PM_* bits) */
+ void (*micpm_set_turbo)(int); /* Notify PM of new turbo setting */
+ void (*micpm_vf_refresh)(void); /* Refresh core V/F lists */
+ int (*micpm_get_pmcfg)(void); /* Get PM operating mode (PMCFG_*_BIT positions) */
+};
+
+extern struct micpm_params pm_reg;
+extern struct micpm_callbacks pm_cb;
+
+
+/*
+ * Args for mt_ttl() function
+ */
+
+#define TTL_OFF 0
+#define TTL_ON 1
+
+#define TTL_POWER 0
+#define TTL_THERMAL 1
+
+
+/*
+ * Bit locations for micpm_get_turbo() and micpm_set_turbo()
+ */
+
+#define MR_PM_MODE (1 << 0) /* Turbo mode */
+#define MR_PM_STATE (1 << 1) /* Current turbo state */
+#define MR_PM_AVAIL (1 << 2) /* Turbo mode available */
+
+
+/*
+ * Bit positions for the different features turned on/off
+ * in the uOS PM configuration, for micpm_get_pmcfg().
+ */
+
+#define PMCFG_PSTATES_BIT 0
+#define PMCFG_COREC6_BIT 1
+#define PMCFG_PC3_BIT 2
+#define PMCFG_PC6_BIT 3
+
+
+/*
+ * Register/Unregister functions in micpm driver that RAS calls
+ * during module init/exit. Pointers to the exchanged data
+ * structures are passed during registration.
+ * The RAS module guarantee that the pointers are valid until
+ * the unregister function is called. That way the PM module can
+ * modify the core frequency/voltage lists if they gets changed.
+ * The callbacks must always either be a valid function pointer
+ * or a null pointer.
+ */
+
+extern int micpm_ras_register(struct micpm_callbacks *, struct micpm_params *);
+extern void micpm_ras_unregister(void);
+
+extern int mr_pm_ttl(struct mr_rsp_ttl *);
+extern int pm_init(void);
+extern void pm_exit(void);
+#endif
+
+
+/*
+ * Debug tools
+ */
+
+extern void dmp_hex(void *, int, const char *, ...);
+
+#endif /* Recursion block */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * Definition of the public RAS Monitoring Thread interface.
+ * Access to RAS features is expected from SCIF and through
+ * nodes under '/sys/class/micras'. Both interfaces end up
+ * in the same code and thus present the exact same data.
+ *
+ * Some information that are available elsewhere through standard
+ * Linux mechanism are included in this API, though things like
+ * process status (/proc/<pid>/stat), cpu status (/proc/stat),
+ * and memory status (/proc/vmstat) are better from the source.
+ */
+
+#ifndef _MICRAS_API_H_
+#define _MICRAS_API_H_ 1
+
+#ifdef __cplusplus
+extern "C" { /* C++ guard */
+#endif
+
+/*
+**
+** Configuration manifests
+**
+*/
+
+#pragma pack(push, 4) /* Windows requirement: fixed 4-byte packing across both sides of the wire */
+
+
+/*
+ * RAS module version info: M.NP
+ */
+
+#define RAS_MAJOR "1"
+#define RAS_MINOR "0"
+#define RAS_PATCH " "
+#define RAS_VER RAS_MAJOR "." RAS_MINOR RAS_PATCH
+
+
+/*
+ * RAS services in uOS kernel listens on this port for incoming queries.
+ * Consumers may establish multiple connections to this port, though no
+ * guarantee on connection processing order will be given. Transactions
+ * on a connection will be processed and replied to in order received.
+ */
+
+#define MR_MON_PORT SCIF_RAS_PORT_0
+#define MR_SCIF_MAX 32
+
+
+/*
+ * Some array max sizes.
+ * These may be replaced by system wide constants
+ * if they become available in the source tree.
+ */
+
+#define MR_VERS_LEN 120 /* Version string lengths */
+#define MR_GUID_LEN 16 /* Global unique ID length (bytes) */
+#define MR_SENO_LEN 12 /* Serial number length (bytes) */
+#define MR_PVER_LEN 8 /* API version string length */
+#define MR_PTAB_LEN 64 /* PM freq/volt pairs */
+#define MR_DIES_LEN 9 /* Die temperatures */
+#define MR_BRDS_LEN 4 /* Board temp sensors */
+#define MR_GVND_LEN 16 /* GDDR vendor string length */
+#define MR_CORE_LEN 62 /* Max number of CPU cores */
+
+
+/*
+** Transaction header for requests and responses is a fixed size
+** record followed by an optional variable length data block.
+**
+** Fields usage:
+** cmd [15] data field is error record
+** cmd [14] response to opcode
+** cmd [13:0] opcode
+** len length of payload
+** parm command parameter
+** stamp host side cookie, performance monitoring
+** spent processing time, performance monitoring
+**
+** Command codes:
+** Codes that directly relate to cores may set the 'parm' field to a
+** non-zero value to address one core (base 1) instead of them all.
+**
+*/
+
+typedef struct mr_hdr { /* Fixed-size transaction header (see field usage above) */
+ uint16_t cmd; /* Command field */
+ uint16_t len; /* Size of data payload */
+ uint32_t parm; /* Parameter field */
+ uint64_t stamp; /* Time stamp of 'send' (set by host) */
+ uint64_t spent; /* Time used on response (rdtsc delta) */
+} MrHdr;
+
+#define MR_RESP (1 << 14) /* Response bit (cmd bit 14) */
+#define MR_ERROR (1 << 15) /* Error bit (cmd bit 15): data field is an MrErr record */
+#define MR_OP_MASK (MR_RESP - 1) /* Opcode mask (cmd bits 13:0) */
+
+#define MR_REQ_HWINF 1 /* Get hardware info */
+#define MR_REQ_VERS 2 /* Get version strings */
+#define MR_REQ_CFREQ 3 /* Get core frequencies */
+#define MR_SET_CFREQ 4 /* Set core frequency */
+#define MR_REQ_CVOLT 5 /* Get core voltages */
+#define MR_SET_CVOLT 6 /* Set core voltage */
+#define MR_REQ_PWR 7 /* Get power metrics */
+#define MR_REQ_PLIM 8 /* Get power limit */
+#define MR_SET_PLIM 9 /* Set power limit */
+#define MR_REQ_CLST 10 /* Get core list */
+#define MR_ENB_CORE 11 /* Enable core */
+#define MR_DIS_CORE 12 /* Disable core */
+#define MR_REQ_GDDR 13 /* Get GDDR device info */
+#define MR_REQ_GFREQ 14 /* Get GDDR frequencies */
+#define MR_SET_GFREQ 15 /* Set GDDR frequency */
+#define MR_REQ_GVOLT 16 /* Get GDDR voltages */
+#define MR_SET_GVOLT 17 /* Set GDDR voltage */
+#define MR_REQ_TEMP 18 /* Get board temperatures */
+#define MR_REQ_FAN 19 /* Get fan status */
+#define MR_SET_FAN 20 /* Set fan power */
+#define MR_REQ_ECC 21 /* Get ECC mode */
+#define MR_SET_ECC 22 /* Set ECC mode */
+#define MR_REQ_TRC 23 /* Get debug trace level */
+#define MR_SET_TRC 24 /* Set debug trace level */
+#define MR_REQ_TRBO 25 /* Get turbo mode status */
+#define MR_SET_TRBO 26 /* Set turbo mode status */
+#define MR_REQ_OCLK 27 /* Get overclocking status */
+#define MR_SET_OCLK 28 /* Set overclocking status */
+#define MR_REQ_CUTL 29 /* Get core utilization */
+#define MR_REQ_MEM 30 /* Get memory utilization */
+#define MR_REQ_OS 31 /* Get OS status & process list */
+#define MR_REQ_PROC 32 /* Get process details */
+#define MR_REQ_THRD 33 /* Get thread details */
+#define MR_REQ_PVER 34 /* Get API version */
+#define MR_CMD_PKILL 35 /* Kill process */
+#define MR_CMD_UKILL 36 /* Kill processes owned by user */
+#define MR_GET_SMC 37 /* Get SMC register */
+#define MR_SET_SMC 38 /* Write SMC register */
+#define MR_REQ_PMCFG 39 /* Get PM config mode */
+#define MR_REQ_LED 40 /* Get LED mode */
+#define MR_SET_LED 41 /* Set LED mode */
+#define MR_REQ_PROCHOT 42 /* Get PROC hot trigger */
+#define MR_SET_PROCHOT 43 /* Set PROC hot trigger */
+/*
+ * NOTE(review): GPUHOT shares opcodes 42/43 with PROCHOT. The two
+ * names appear to be aliases for the same trigger (the MR_REQ_PROCHOT
+ * notes below describe both as the GPU Hot / SMC 0x2c-0x2d trigger) —
+ * confirm before ever assigning 42/43 to a new command.
+ */
+#define MR_REQ_GPUHOT 42 /* Get GPU hot trigger */
+#define MR_SET_GPUHOT 43 /* Set GPU hot trigger */
+#define MR_REQ_PWRALT 44 /* Get power alert trigger */
+#define MR_SET_PWRALT 45 /* Set power alert trigger */
+#define MR_REQ_PERST 46 /* Get persistent triggers flag */
+#define MR_SET_PERST 47 /* Set persistent triggers flag */
+#define MR_REQ_TTL 48 /* Get Throttle state */
+#define MR_REQ_MAX 48 /* Max command code */
+
+
+/*
+**
+** Transaction error record:
+** If an error occurs during the handling of a request, an
+** error record is returned, possibly with supplemental info.
+**
+** Fields usage:
+** err code indicating the error condition
+** len size of additional data
+**
+** For now there is no definition on what supplemental info
+** should look like, but the idea is to open for a possibility
+** of giving very precise specification on what the error was.
+** Consider it a place holder for future use.
+**
+** Error codes:
+** Code 'NOMEM' means that space for response generation was unavailable.
+** Code 'NOVAL' is used to indicate that a valid request (i.e. a query
+** on something temporarily unavailable, like processor utilization on
+** a core in a sleep state) has no valid response.
+**
+*/
+
+typedef struct mr_err { /* Error record; returned with MR_ERROR set in MrHdr.cmd */
+ uint16_t err; /* Error code field */
+ uint16_t len; /* Length of additional error info */
+} MrErr;
+
+#define MR_ERR_INVOP 1 /* Dofus, command/opcode invalid */
+#define MR_ERR_INVLEN 2 /* Dofus, length not valid for opcode */
+#define MR_ERR_INVAUX 3 /* Dofus, parm field not valid for opcode */
+#define MR_ERR_INVDATA 4 /* Dofus, content of data block invalid */
+#define MR_ERR_PERM 5 /* Failure, privileged command */
+#define MR_ERR_NOMEM 6 /* Failure, out of memory */
+#define MR_ERR_SMC 7 /* Failure, SMC communication */
+#define MR_ERR_NOVAL 8 /* Failure, no valid value to report */
+#define MR_ERR_UNSUP 9 /* Failure, not implemented (temporary) */
+#define MR_ERR_RANGE 10 /* Failure, parameter out of range */
+#define MR_ERR_PEND 11 /* Pending, internal use only */
+
+
+/*
+**
+** Response container structures below.
+**
+** Strings are returned in Pascal format (why?), i.e. pre-fixed
+** with a 1 byte length field and post-fixed with a 0 byte.
+**
+*/
+
+
+/*
+ * MIC Hardware Info
+ * REQ_HWINF Notes:
+ * - no idea how to determine PCI-E slot, it's a host side thing.
+ * - assume revision is same as model ID in the component ID register
+ * - unique ID not available in all flash versions
+ * - Hardware version codes are reported as-is, anticipating
+ * recipient to know what the codes means.
+ */
+
+typedef struct mr_rsp_hwinf { /* Response payload for MR_REQ_HWINF */
+ uint8_t guid[MR_GUID_LEN]; /* Unique ID, from SMC */
+ uint8_t board; /* Board type, SMC HW 17:16 */
+ uint8_t fab; /* Fab version, SMC HW 10:8 */
+ uint8_t sku; /* SKU #, SMC HW 2:0 */
+ uint8_t slot; /* PCI-E slot, get from where ? */
+ uint8_t rev; /* Revision, component ID 16:19 */
+ uint8_t step; /* Stepping, component ID 12:15 */
+ uint8_t substep; /* Sub-stepping, component ID 8:11 */
+ uint8_t serial[MR_SENO_LEN]; /* Serial number, from SMC */
+} MrRspHwInf;
+
+
+
+/*
+ * MIC API version
+ * REQ_PVER Notes:
+ * - returns RAS_VER string the module was built with.
+ */
+
+typedef struct mr_rsp_pver { /* Response payload for MR_REQ_PVER */
+ char api[MR_PVER_LEN]; /* Ras module version (RAS_VER string) */
+} MrRspPver;
+
+
+
+/*
+ * MIC uOS/Flash version
+ * REQ_VERS Notes:
+ * - unclear at this point what the lengths of these strings are.
+ * The limit of 128 bytes is a 'best safe guess' and may change.
+ * - KnF: My card has 3 flash strings, for now that's the count.
+ * - KnC: Has fewer defined version strings, currently only fboot0
+ * string has been defined.
+ */
+
+typedef struct mr_rsp_vers { /* Response payload for MR_REQ_VERS; Pascal-format strings */
+ char fboot0[MR_VERS_LEN]; /* Fboot 0 version */
+ char fboot1[MR_VERS_LEN]; /* Fboot 1 version */
+ char flash[3][MR_VERS_LEN]; /* Flash block versions */
+ char uos[MR_VERS_LEN]; /* uOS kernel version */
+ char fsc[MR_VERS_LEN]; /* Fan controller version */
+} MrRspVers;
+
+
+
+/*
+ * Core frequency
+ * REQ_CFREQ Notes:
+ * - current is clock read from CURRENTRATIO register.
+ * - default/requested clock is read from COREFREQ register.
+ * In KnF, the CURRENTRATIO is not used and therefore
+ * COREFREQ is reported as current speed and the default
+ * is simply the first value registered (at module load).
+ * - supported speeds are part of freq/voltage pairs maintained
+ * by the cpu_freq driver as part of PM (cpu_freq driver).
+ * - unclear if we should allow manual control (writes).
+ */
+
+typedef struct mr_rsp_freq { /* Response payload for MR_REQ_CFREQ */
+ uint32_t cur; /* Actual core speed in kHz */
+ uint32_t def; /* Set core speed in kHz */
+ uint32_t slen; /* Supported count (entries used in supt[]) */
+ uint32_t supt[MR_PTAB_LEN]; /* Supported speed list in kHz */
+} MrRspFreq;
+
+/*
+ * Set core frequency
+ * New frequency (in kHz) passed in MrHdr.parm
+ * SET_CFREQ Notes:
+ * - need to turn off PM for this to stick
+ */
+
+
+
+/*
+ * Core voltage
+ * REQ_CVOLT Notes:
+ * - KnF: Two core voltages; current voltage set from COREVOLT
+ * register and sense1 read in the BOARD_VOLTAGE_SENSE register.
+ * - KnC: 3 potential sources; SVID, SMC, and SBOX registers.
+ * SBOX regs require SMC telemetry which is uncertain.
+ * SVID does not work in A0, B0 is TBD.
+ * SMC will eventually relay VR data.
+ * Only SVID gives both set and actual values.
+ * Only SMC sets c_val field, zero is good.
+ * - Supported voltages are either determined from what the VRs
+ * can support or if PM is active it is part of the freq/voltage pairs
+ * maintained by the cpu_freq driver as part of PM (cpu_freq driver).
+ */
+
+typedef struct mr_rsp_volt { /* Response payload for MR_REQ_CVOLT */
+ uint32_t cur; /* Core voltage read in uV */
+ uint32_t set; /* Core voltage set in uV */
+ uint8_t c_val; /* Valid bits, volt read (SMC only, zero is good) */
+ uint32_t slen; /* Supported count (entries used in supt[]) */
+ uint32_t supt[MR_PTAB_LEN]; /* Supported voltage list in uV */
+} MrRspVolt;
+
+/*
+ * Set core voltage
+ * New voltage passed in MrHdr.parm
+ * SET_CVOLT Notes:
+ * - need to turn off PM for this to stick
+ * - Unclear if we should allow manual control through this API.
+ */
+
+
+
+/*
+ * Card power
+ * REQ_PWR Notes
+ * - Power status only available on KnC via SMC query
+ * - VR status on KnC may come from VRs directly or from SMC query
+ * - VR status on KnF comes from SBOX registers (telemetry)
+ * - If available, status bits from query is provided, zero is good.
+ */
+
+typedef struct mr_rsp_pws { /* Power sensor status */
+ uint32_t prr; /* Current reading, in uW */
+ uint8_t p_val; /* Valid bits, power (zero is good) */
+} MrRspPws;
+
+typedef struct mr_rsp_vrr { /* Voltage regulator status */
+ uint32_t pwr; /* Power reading, in uW */
+ uint32_t cur; /* Current, in uA */
+ uint32_t volt; /* Voltage, in uV */
+ uint8_t p_val; /* Valid bits, power */
+ uint8_t c_val; /* Valid bits, current */
+ uint8_t v_val; /* Valid bits, voltage */
+} MrRspVrr;
+
+typedef struct mr_rsp_power { /* Response payload for MR_REQ_PWR */
+ MrRspPws tot0; /* Total power, win 0 */
+ MrRspPws tot1; /* Total power, win 1 */
+ MrRspPws inst; /* Instantaneous power */
+ MrRspPws imax; /* Max instantaneous power */
+ MrRspPws pcie; /* PCI-E connector power */
+ MrRspPws c2x3; /* 2x3 connector power */
+ MrRspPws c2x4; /* 2x4 connector power */
+ MrRspVrr vccp; /* Core rail */
+ MrRspVrr vddg; /* Uncore rail */
+ MrRspVrr vddq; /* Memory subsystem rail */
+} MrRspPower;
+
+
+
+/*
+ * Power envelope
+ * REQ_PLIM Notes:
+ * - power envelope is a PM property. A physical limit
+ * is given to PM, which then calculate derivative high
+ * and low water mark figures.
+ * - values are retrieved from PM module
+ */
+
+typedef struct mr_rsp_plim { /* Response payload for MR_REQ_PLIM; values from PM module */
+ uint32_t phys; /* Physical limit, in W */
+ uint32_t hmrk; /* High water mark, in W */
+ uint32_t lmrk; /* Low water mark, in W */
+} MrRspPlim;
+
+/*TBD
+ * Set power envelope
+ * New value passed in MrHdr.parm
+ * SET_PLIM Notes:
+ * - not sure if setting this should be allowed at all.
+ */
+
+
+
+/*
+ * Core information
+ * REQ_CLST Notes:
+ * - for the average user a core count is all required, since
+ * logically the cores are _always_ enumerated 0 .. <n>-1.
+ * Physical enumeration, such as ring stop, are not useful.
+ * - perhaps this request should return the CPU bitfields from
+ * the uOS of offline, online, possible, and present masks.
+ * Would allow watching of PM activity.
+ */
+
+typedef struct mr_rsp_clst { /* Response payload for MR_REQ_CLST */
+ uint16_t count; /* Cores present */
+ uint16_t thr; /* Threads per core */
+} MrRspClst;
+
+
+/*
+ * Set core enable/disable
+ * Core id & set/reset value passed in MrHdr.parm
+ * ENB_CORE/DIS_CORE Notes:
+ * - uOS Linux does not have write access to HW config in SPI flash.
+ * No way to enable/disable cores
+ * - only listed here since if compatibility with FreeBSD is needed.
+ */
+
+
+
+/*
+ * Memory device info
+ * REQ_GDDR Notes:
+ * - This is read from scratch9, i.e. provided by bootstrap.
+ */
+
+typedef struct mr_rsp_gddr { /* Response payload for MR_REQ_GDDR (from scratch9/bootstrap) */
+ char dev[MR_GVND_LEN]; /* Device vendor */
+ uint16_t rev; /* Device revision */
+ uint32_t size; /* Device size, in Mbit/device */
+ uint32_t speed; /* Transactions speed, kT/sec */
+} MrRspGddr;
+
+
+
+/*
+ * GDDR frequencies
+ * REQ_GFREQ Notes:
+ * - current clock can be read from MEMORYFREQ register
+ * - the GDDR nominal frequency is reported
+ * - the supported frequency list contains values that PLLs
+ * are capable of producing. Info is of limited use, since
+ * there is no way to control the GDDR frequency (locked by fuses).
+ */
+
+typedef struct mr_rsp_gfreq { /* Response payload for MR_REQ_GFREQ */
+ uint32_t cur; /* Current GDDR speed in kHz */
+ uint32_t def; /* Default GDDR speed in kHz */
+ uint32_t slen; /* Supported count (entries used in supt[]) */
+ uint32_t supt[MR_PTAB_LEN]; /* Supported speeds list in kHz */
+} MrRspGfreq;
+
+/*
+ * Set GDDR frequency
+ * New frequency passed in MrHdr.parm
+ * SET_GFREQ Notes:
+ * - uOS cannot alter the PLLs because it requires retraining, which
+ * causes loss of memory content.
+ * - KnF: uOS does not have write access to SPI flash, which is required
+ * to modify the GDDR frequency at next reboot.
+ * - KnC: GDDR frequency is hard locked by fuses, cannot change, ever!!!
+ */
+
+
+
+/*
+ * GDDR voltages
+ * REQ_GVOLT Notes:
+ * - KnF: Two GDDR voltages; current voltage set from MEMVOLT
+ * register and sense2 from BOARD_VOLTAGE_SENSE register.
+ * MEMVOLT register always returns zero, only sense2
+ * actually returns something useful in current Si.
+ * - KnC: 3 potential sources; SVID, SMC, and SBOX registers.
+ * SBOX regs require SMC telemetry which is uncertain.
+ * SVID does not work in A0, B0 is TBD.
+ * SMC will eventually relay VR data
+ * Only SVID gives both set and actual values.
+ * Only SMC sets c_val field, zero is good.
+ * - Supported voltages reported are voltages the VRs can be programmed
+ * to supply. Info is of limited use, since there is no way to control
+ * the GDDR voltage (locked by fuses).
+ */
+
+typedef struct mr_rsp_gvolt { /* Response payload for MR_REQ_GVOLT */
+ uint32_t cur; /* GDDR voltage read in uV */
+ uint32_t set; /* GDDR voltage set in uV */
+ uint8_t c_val; /* Valid bits, volt read (SMC only, zero is good) */
+ uint32_t slen; /* Supported count (entries used in supt[]) */
+ uint32_t supt[MR_PTAB_LEN]; /* Supported voltage list in uV */
+} MrRspGvolt;
+
+/*
+ * Set GDDR voltage
+ * New voltage passed in MrHdr.parm
+ * SET_GVOLT Notes:
+ * - uOS cannot alter the VR settings at all. Even if it could
+ * then it still clash with the need to retrain and memory loss.
+ * - KnF: uOS does not have write access to SPI flash, which is required
+ * to modify the GDDR voltage at next reboot.
+ * - KnC: GDDR voltage is hard locked by fuses, cannot change, ever!!!
+ */
+
+
+
+/*
+ * Board temperatures
+ * REQ_TEMP Notes:
+ * - CPU die temps can be read from THERMAL_STATUS (highest
+ * of several sensors) and CURRENT_DIE_TEMP registers.
+ * The die sensors values do not match the status
+ * value, so the conversion formula or calibration
+ * needs a re-visit.
+ * - If we could get at them, we could provide readings
+ * from the following devices, but are they all useful?
+ * Fan inlet sensor
+ * Fan exhaust sensor
+ * GDDR temp (one chip is measured) sensor
+ * Vccp VR
+ * Vddg VR
+ * Vddq VR
+ * - most devices report current and maximum temperatures in
+ * degrees Celsius as a signed integer, 9 bits for die temp
+ * and 8 bits for voltage regulators, 12 bit for sensors.
+ */
+
+typedef struct mr_rsp_tsns { /* Single temperature sensor reading */
+ int16_t cur; /* Current temperature, in C */
+ int8_t c_val; /* Valid bits, if available */
+} MrRspTsns;
+
+typedef struct mr_rsp_tdie { /* Per-die temperature reading */
+ int16_t cur; /* Current temperature, in C */
+ int16_t max; /* Maximum temperature, in C */
+} MrRspTdie;
+
+typedef struct mr_rsp_temp { /* Response payload for MR_REQ_TEMP */
+ MrRspTsns die; /* Highest on-die measure */
+ MrRspTdie dies[MR_DIES_LEN]; /* All on-die measures */
+ MrRspTsns brd; /* Highest board measure */
+ MrRspTsns fin; /* Fan inlet */
+ MrRspTsns fout; /* Fan outlet */
+ MrRspTsns gddr; /* Gddr device */
+ MrRspTsns vccp; /* Vccp VR */
+ MrRspTsns vddg; /* Vddg VR */
+ MrRspTsns vddq; /* Vddq VR */
+} MrRspTemp;
+
+
+
+/*
+ * Fan speed
+ * REQ_FAN Notes:
+ * - fan status is reported in RPM and its control is
+ * a pulse width modulation ratio to 255, i.e. 0 is min,
+ * 127 is ~50% and 255 is max.
+ * - the card has logic for controlling two fans.
+ * Only one is used and we only report status for one.
+ */
+
+typedef struct mr_rsp_fan { /* Response payload for MR_REQ_FAN */
+ uint16_t rpm; /* Fan speed, rpm */
+ uint8_t pwm; /* Active PWM ratio, 0..255 */
+ uint8_t override; /* Override flag */
+ uint8_t r_val; /* Valid bits, speed */
+ uint8_t p_val; /* Valid bits, PWM */
+} MrRspFan;
+
+/*
+ * Set fan speed
+ * Control is passed in MrHdr.parm (struct fits into 32 bit)
+ * SET_FAN Notes:
+ * - this may collide with OOB methods (such as IPMI)
+ * that has priority, no guarantee this will stick.
+ * - changing fan speed parameters may interfere
+ * with PM in undefined ways.
+ */
+
+typedef struct mr_set_fan { /* MR_SET_FAN control; passed in MrHdr.parm (fits in 32 bit) */
+ uint8_t override; /* Override enable flag */
+ uint8_t pwm; /* Force PWM ratio, 0..255 */
+} MrSetFan;
+
+
+
+/*
+ * Error correction mode
+ * REQ_ECC Notes:
+ * - retrieve this info from one (any) of the gboxes.
+ */
+
+typedef struct mr_rsp_ecc { /* Response payload for MR_REQ_ECC */
+ uint32_t enable; /* ECC mode: 1 enabled, 0 disabled */
+} MrRspEcc;
+
+/*
+ * Set error correction mode
+ * New mode passed in MrHdr.parm
+ * SET_ECC Notes:
+ * - ECC cannot be changed on the fly by uOS, requires retraining
+ * of GDDR which causes loss of memory content.
+ * - uOS Linux does not have write access to HW config in SPI flash.
+ * No way to change ECC enable/disable setting.
+ */
+
+
+
+/*
+ * Trace level
+ * REQ_TRC Notes:
+ * - No idea what support this has in uOS Linux.
+ */
+
+typedef struct mr_rsp_trc { /* Response payload for MR_REQ_TRC */
+ uint32_t lvl; /* Debug trace level */
+} MrRspTrc;
+
+/*
+ * Set trace level
+ * New level passed in MrHdr.parm
+ * SET_TRC Notes:
+ * - No idea what this does in uOS Linux (nothing yet).
+ */
+
+
+
+/*
+ * Turbo setting
+ * REQ_TRBO Notes:
+ * - Retrieve current actual turbo mode and state
+ * - 'set' value: 1 if enabled, 0 otherwise
+ * - 'state' value: 1 if active, 0 otherwise
+ * - 'avail' value: 1 if TRBO supported, 0 otherwise
+ */
+
+typedef struct mr_rsp_trbo { /* Response payload for MR_REQ_TRBO */
+ uint8_t set; /* Turbo mode: 1 enabled, 0 otherwise */
+ uint8_t state; /* Turbo state: 1 active, 0 otherwise */
+ uint8_t avail; /* Turbo mode available: 1 supported, 0 otherwise */
+ uint8_t pad; /* Pad to 32 bit */
+} MrRspTrbo;
+
+/*
+ * Set turbo mode
+ * New mode passed in MrHdr.parm
+ * SET_TRB Notes:
+ * - Set always allowed, but silently ignored if not available.
+ */
+
+
+
+/*
+ * LED override
+ * REQ_LED Notes:
+ * - KnC: Retrieve current LED mode setting, 0=normal, 1=identify
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
+typedef struct mr_rsp_led { /* Response payload for MR_REQ_LED (KnC only) */
+ uint32_t led; /* LED mode setting: 0 normal, 1 identify */
+} MrRspLed;
+
+/*
+ * Set LED mode
+ * New mode passed in MrHdr.parm
+ * SET_LED Notes:
+ * - KnC: Mode values
+ * 0 is normal SMC control (fast blink)
+ * 1 is identify mode (2 blinks every 2 seconds)
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
+
+
+/*
+ * Overclocking
+ * REQ_OCLK Notes:
+ * - Currently no idea how to represent overclocking state
+ * - Overclocking not supported, return MR_RSP_NOVAL
+ */
+
+typedef struct mr_rsp_oclk { /* Response payload for MR_REQ_OCLK (unsupported, see note above) */
+ uint32_t freq; /* Over clocking setting */
+} MrRspOclk;
+
+/*
+ * Set overclocking mode
+ * New mode passed in MrHdr.parm
+ * SET_OCLK Notes:
+ * - Overclocking not supported, return MR_RSP_NOVAL
+ */
+
+
+
+/*
+ * Processor utilization (OS status)
+ * REQ_CUTL Notes:
+ * - returned info is a simple sum of 4 logical CPUs
+ * - the counter units returned are Linux kernel jiffies,
+ * typically in range 1 - 10 ms, based on continuous
+ * counters maintained by the kernel. The number of
+ * jiffies per second is reported for scaling purposes.
+ * In order to get a current 'utilization' figure, the
+ * host needs to query the counters at regular intervals
+ * and use this formula to achieve a percentage:
+ * u = ((c2 - c1) / (t2 - t1)) * 100
+ * or
+ * u = ((c2 - c1) * 100) / (t2 - t1)
+ * where t2 - t1 = elapsed jiffies between samples
+ * c2 - c1 = usage jiffy counts between samples
+ * - the listed counters does not add up to cover the
+ * wall clock time exactly, sampling errors do occur.
+ * - counters for iowait, irq, and softirq are not included.
+ * - jiffy counters are updated by the timer tick interrupt
+ * handler. Its accuracy is known to be limited, see
+ * Documentation/cpu-load.txt for details.
+ * - counters are reported regardless of core sleep states
+ */
+
+typedef struct mr_rsp_ccnt { /* Cumulative jiffy counters for one core (or system-wide sum) */
+ uint64_t user; /* Normal user mode jiffies */
+ uint64_t nice; /* 'Nice' user mode jiffies */
+ uint64_t sys; /* System mode jiffies */
+ uint64_t idle; /* Idle time jiffies */
+} MrRspCcnt;
+
+typedef struct mr_rsp_cutl { /* Response payload for MR_REQ_CUTL */
+ uint32_t tck; /* Actual jiffs/sec (scaled by 256) */
+ uint16_t core; /* Cores reported on */
+ uint16_t thr; /* Threads per core */
+ uint64_t jif; /* Jiffy counter at query time */
+ MrRspCcnt sum; /* System wide counters */
+ MrRspCcnt cpu[MR_CORE_LEN]; /* Counters per core */
+} MrRspCutl;
+
+
+
+/*
+ * Memory utilization (OS status)
+ * REQ_MEM Notes:
+ * - memory snapshot is obtained from kernel structs.
+ * No walk of page descriptors is performed.
+ * - Not all memory stats are visible (exported to) modules.
+ *
+ *TBD:
+ * - Need clarification on what memory utilization means.
+ * For now the total, free and buffer memory is reported.
+ */
+
+typedef struct mr_rsp_mem { /* Response payload for MR_REQ_MEM */
+ uint32_t total; /* Total usable RAM in kB */
+ uint32_t free; /* Free memory in kB */
+ uint32_t bufs; /* Buffer storage in kB */
+} MrRspMem;
+
+
+
+/*
+ * Process management (OS status)
+ * REQ_OS/REQ_PROC/REQ_THRD Notes:
+ * - split in 3 levels of detail:
+ * 1) Get set of applications (exclude kernel processes and threads)
+ * 2) Get details on specified application (pid in MrHdr.parm),
+ * which includes a thread pid list (up to 256 threads).
+ * 3) Get details on specific thread (thread id in MrHdr.parm)
+ * Opcodes 2 and 3 will, apart from thread list, mostly report the same
+ * set of details. What needs monitoring (see 'man proc', section on
+ * /proc/<pid>/stat and /proc/<pid>/status for what's available)?
+ * - process time counters are continuous, so if any ratio between
+ * the time a process/thread spends and actual wall clock time is
+ * to be calculated, the same logic for dynamic display applies as
+ * for the CUTL counters. I.e. a jiffy stamp is needed in the reply.
+ *TBD:
+ * - Introduce some sanity in time measurements.
+ * - Level 3 (thread details) is not implemented (is it needed ?).
+ * - Add ppid & credentials in MrRspProc? Needed to make a "top" display.
+ */
+
+typedef struct mr_rsp_os { /* Response payload for MR_REQ_OS */
+ uint64_t uptime; /* Seconds since OS boot */
+ uint64_t loads[3]; /* 1, 5, 15 minute load average */
+ uint32_t alen; /* Application count (entries used in apid[]) */
+ uint32_t apid[256]; /* Application PIDs */
+} MrRspOs;
+
+typedef struct mr_rsp_proc { /* Response payload for MR_REQ_PROC (pid in MrHdr.parm) */
+ uint32_t pid; /* Process ID */
+ char name[16]; /* Program name (less path) */
+ uint64_t utime; /* User time in uS */
+ uint64_t stime; /* System time in uS */
+ uint64_t etime; /* Elapsed time in uS */
+ uint32_t rss; /* Resident set, in kB */
+ uint32_t vm; /* VM size, in kB */
+ uint32_t tlen; /* Thread count (entries used in tpid[]) */
+ uint32_t tpid[256]; /* Process threads */
+} MrRspProc;
+
+
+
+/*
+ * Terminate process
+ * Signal passed in MrHdr.parm bits 31:24 (see 'kill -l')
+ * Process ID passed in MrHdr.parm bits 23:0 (see /proc/sys/kernel/pid_max)
+ * CMD_PKILL Notes:
+ * - This is specifically for MPI style cluster managers
+ * who wants to rid the card of a specific process.
+ * - Processes owned by users ID's less than 500 are immune to this.
+ */
+
+
+
+/*
+ * Terminate user
+ * Signal passed in MrHdr.parm bits 31:24 (see 'kill -l')
+ * User ID passed in MrHdr.parm bits 23:0 (see /etc/login.defs).
+ * CMD_UKILL Notes:
+ * - This is specifically for MPI style cluster managers to
+ * rid the card of processes owned by a specific user ID.
+ * - User ID's below 500 will silently be ignored.
+ */
+
+
+
+/*
+ * Read SMC register
+ * MR_GET_SMC Notes:
+ * - Both SMC and FSC devices are accessed through I2C busses, which
+ * means that retrieval will be slow (order of milli seconds).
+ * - KnC: allows direct access to the SMC CSRs, which can be read
+ * or written in any random order.
+ * SMC CSR definitions are not within the scope of this API.
+ * Register number passed in MrHdr.parm bits 7:0 (8 bits).
+ * SMC registers are 32 bit, except one (UUID) that is 16 byte.
+ * - KnF: allows direct access to the fan speed controller (FSC)
+ * status registers on board temp and power sensors.
+ * The FSC execute command register every 50 mSec, which means
+ * that register needs 'SET' and hold for 50 mSec before any
+ * value can be returned. For telemetry data the SET is done
+ * implicitly, all other has to execute a 'SET' before running
+ * a 'GET' command.
+ *
+ * FSC register definitions are not within the scope of this API.
+ * All sensor data returns are 8 bit wide.
+ */
+
+typedef struct mr_rsp_smc { /* Response payload for MR_GET_SMC */
+ uint8_t reg; /* Register number */
+ uint16_t width; /* Valid return bytes (4 or 16) */
+ union {
+ uint32_t val; /* Requested register value */
+ uint8_t uuid[16]; /* Unique identifier */
+ uint8_t serial[12]; /* Card serial number */
+ } rtn; /* Which member is valid depends on 'width'/register */
+} MrRspSmc;
+
+/*
+ * Write SMC register
+ * Register number passed in MrHdr.parm bits 31:24 (8-bit address decode).
+ * Register value passed in MrHdr.parm bits 23:0 (24 bit data).
+ * MR_SET_SMC Notes:
+ * - Improper use of this command can cause thermal shutdown of the card.
+ * - Improper use can interfere with power management.
+ * - KnC: For security reasons only the following registers are writeable:
+ * 20, 22 IPMI <not documented>
+ * 2b, 2c, 2d, 2f, 30, 31, 32, 33 PM control parameters
+ * 4b Fan Adder
+ * 60 LED control
+ * No SMC registers of interest are more than 16 bits wide.
+ * - KnF: For security reasons only the following registers are writable:
+ * 0 Fan 1 Speed Override
+ * 1 Power Management and Control Config
+ * 11 General Status command
+ * Selector is 8 bits wide and only valid values are
+ * 20, 21, 22, 23 Power sensors, 1s avg.
+ * 30, 31, 32, 33 Power sensors, 1 sample
+ * a1, a2, a3, a4, a5 Max temps
+ */
+
+
+
+/*
+ * Get PM config mode
+ * REQ_PMCFG notes:
+ * - Return value is reported 'as-is' from the PM module.
+ */
+
+typedef struct mr_rsp_pmcfg { /* Response payload for MR_REQ_PMCFG; PMCFG_*_BIT positions */
+ uint32_t mode; /* Current PM operation mode, as-is from PM module */
+} MrRspPmcfg;
+
+
+
+/*
+ * Read Power triggers
+ * Consist of two trigger points (power,time), which can be calculated
+ * from SKU at card power-on or be persistent across reboots.
+ * At trigger (PROCHOT), GPU Hot gets asserted
+ * At trigger (PWRALT), Power Alert gets asserted
+ *
+ * MR_REQ_PROCHOT, MR_REQ_PWRALT Notes:
+ * - KnC: Read SMC registers for trigger 0 and 1 respectively.
+ * GPUHOT: registers 0x2c and 0x2d
+ * PWRALT: registers 0x2f and 0x30
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
+typedef struct mr_rsp_ptrig { /* Payload for MR_REQ/SET_PROCHOT and MR_REQ/SET_PWRALT */
+ uint16_t power; /* Power limit, Watt */
+ uint16_t time; /* Time windows, mSec */
+} MrRspPtrig;
+
+/*
+ * Write Power triggers
+ * MR_SET_PROCHOT, MR_SET_PWRALT Notes
+ * Structure MrRspPtrig passed in MrHdr.parm
+ * Trigger PROCHOT.power must be higher than trigger PWRALT.power.
+ * - KnC: Write SMC registers for trigger 0 and 1 respectively.
+ * GPUHOT: registers 0x2c and 0x2d
+ * PWRALT: registers 0x2f and 0x30
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ * Warning: MT does not check for GPUHOT.power >= PWRALT.power.
+ *TBD: Should it?
+ * It is anticipated that changes follows reads, i.e. checking
+ * can be checked in application software.
+ */
+
+
+
+/*
+ * Read Persistent Power triggers flag
+ * If set, changes to Power Triggers will be permanent
+ * MR_REQ_PERST Notes:
+ * - KnC: Reads bit 0 of SMC register 0x32
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
+typedef struct mr_rsp_perst { /* Response payload for MR_REQ_PERST (KnC: SMC reg 0x32 bit 0) */
+ uint32_t perst; /* Persistent power triggers */
+} MrRspPerst;
+
+/*
+ * Write Persistent Power triggers flag
+ * New value passed in MrHdr.parm
+ * MR_SET_PERST Notes:
+ * - KnC: Writes bit 0 of SMC register 0x32
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
+
+/*
+ * Read Throttle states
+ * Returns status of current and previous throttle state
+ * retrieved from the card side PM module.
+ * MR_REQ_TTL Notes:
+ * - KnC: Calls PM for latest information.
+ * Note that the 'active' flags can toggle very often,
+ * which may make it less informative for display.
+ * Time tracked in jiffies, not true mSec resolution.
+ * - KnF: not implemented (error MR_ERR_UNSUP)
+ */
+
typedef struct mr_rsp_tstat {	/* One throttle state; three of these make up MrRspTtl */
 uint8_t active; /* Currently active (can toggle rapidly, see MR_REQ_TTL notes) */
 uint32_t since; /* Length of current throttle, mSec (jiffy resolution) */
 uint32_t count; /* Number of throttles */
 uint32_t time; /* Total time throttled, mSec */
} MrRspTstat;
+
typedef struct mr_rsp_ttl {	/* MR_REQ_TTL response: current + historic throttle states */
 MrRspTstat thermal; /* Thermal throttle state */
 MrRspTstat power; /* Power throttle state */
 MrRspTstat alert; /* Power alert state */
} MrRspTtl;
+
+
+#pragma pack(pop) /* Restore to entry conditions */
+
+#ifdef __cplusplus
+} /* C++ guard */
+#endif
+
+#endif /* Recursion block */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS MT module, common code
+ *
+ * Code and data structures to handle get/set tasks for KnC and KnF.
+ * Parties accessing the data structures are supposed to use the
+ * micras_mt_tsk() routines to ensure integrity and consistency.
+ * Particularly important when handling sysfs nodes and actions
+ * requested from SCIF connections must use that method in order
+ * to guarantee serialized access.
+ *
+ * Even if read-only access to latest valid data is required,
+ * it should go through micras_mt_tsk() using dedicated handlers
+ * in this module.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/io.h>
+#include <linux/utsname.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/jiffies.h>
+#include <linux/tick.h>
+#include <linux/kernel_stat.h>
+#include <linux/bitmap.h>
+#include <generated/compile.h>
+#include <generated/utsrelease.h>
+#include <asm/mic/mic_knc/autobaseaddress.h>
+#include <asm/mic/mic_knc/micsboxdefine.h>
+#include "micras.h"
+
+
+/*
+ * Persistent data accessible through the CP api.
+ * Some functions just read/modify hardware CSRs
+ * and thus need no storage between invocations.
+ */
+
/*
 * Response records returned by the mr_get_* handlers below.
 * Records marked 'Card specific' are (also) filled in by the
 * KnF/KnC specific code and are therefore not static; the
 * remaining records are private to this file.
 */
 struct mr_rsp_hwinf hwinf; /* Card specific */
 struct mr_rsp_vers vers; /* Card specific */
static struct mr_rsp_pver pver;
 struct mr_rsp_freq freq; /* Card specific */
 struct mr_rsp_volt volt; /* Card specific */
 struct mr_rsp_power power; /* Card specific */
 struct mr_rsp_plim plim; /* Card specific */
static struct mr_rsp_clst clst;
 struct mr_rsp_gddr gddr;
 struct mr_rsp_gfreq gfreq; /* Card Specific */
 struct mr_rsp_gvolt gvolt; /* Card specific */
 struct mr_rsp_temp temp; /* Card specific */
 struct mr_rsp_ecc ecc; /* Card specific */
static struct mr_rsp_trc trc;
 struct mr_rsp_trbo trbo; /* Card specific */
 struct mr_rsp_pmcfg pmcfg; /* Card specific */
+
+
+/*
+ * Map of SKUs for KnX cards (currently known, will change)
+ * The SKU is identified solely from the PCIe ID and sub-ID.
+ * A zero sub-ID is a don't care.
+ *
+ *TBD: core counts in KnF needs update, not all have 32.
+ *
+ * Notes:
+ * - Unless the PCIe subID differs, there are two 2250 cards
+ * that can't be distinguished from each other, one has 8 TXs
+ * and the other has none. PO cards -> impact only internal.
+ * - Not sure exactly what 2254 is, suspect MPI prototype.
+ */
+
+#define VD(v, d) (PUT_BITS(15,0,(v)) | PUT_BITS(31,16,(d)))
+
static struct sku {
 uint32_t devID; /* PCIe Vendor and device ID (VD() packed) */
 uint32_t subID; /* PCIe Sub- Vendor and device ID; 0 = don't care */
 uint8_t revNo; /* PCIe Revision number (must match exactly) */
 uint8_t cr; /* Core count */
 uint8_t ch; /* Memory channels */
 uint8_t tx; /* TX samplers (only in KnC) */
 char * name; /* SKU name */
} skuList[] = {
 /* Table searched linearly by get_sku(); first match on (devID, subID, revNo) wins. */
 { VD(0x8086, 0x2240), 0, 0x00, 32, 8, 0, "E1" }, /* KnF */
 { VD(0x8086, 0x2241), 0, 0x00, 32, 8, 0, "E2" }, /* KnF */
 { VD(0x8086, 0x2242), 0, 0x00, 32, 8, 0, "E3" }, /* KnF */
 { VD(0x8086, 0x2243), 0, 0x00, 32, 8, 0, "E3" }, /* KnF */
 { VD(0x8086, 0x2249), VD(0x8086, 0xed08), 0, 32, 4, 0, "Ed" }, /* KnF */
 { VD(0x8086, 0x2249), VD(0x8086, 0xed0a), 0, 32, 4, 0, "Eb" }, /* KnF */
 { VD(0x8086, 0x224a), 0, 0x00, 32, 8, 0, "Eb" }, /* KnF */

 { VD(0x8086, 0x2250), 0, 0x00, 60, 16, 0, "SKU1/SKU2" }, /* KnC: ES1, ES1B */
 { VD(0x8086, 0x2250), 0, 0x10, 60, 16, 0, "SKU2" }, /* KnC: ES2 */
 { VD(0x8086, 0x2250), 0, 0x11, 60, 16, 0, "SKU2" }, /* KnC: Mkt2 */
 { VD(0x8086, 0x2250), 0, 0x20, 60, 16, 0, "SKU2" },
 { VD(0x8086, 0x2251), 0, 0x00, 48, 16, 8, "SKU2" },
 { VD(0x8086, 0x2252), 0, 0x00, 48, 16, 0, "SKU3" },
 { VD(0x8086, 0x2253), 0, 0x00, 40, 8, 0, "SKU4/SKU5" }, /* KnC: ES0, ES1 */
 { VD(0x8086, 0x2253), 0, 0x10, 40, 8, 0, "SKU5" },
 { VD(0x8086, 0x2254), 0, 0x00, 62, 16, 0, "??" }, /* KnC: ?? */
 { VD(0x8086, 0x2255), 0, 0x00, 62, 16, 8, "SKUX" }, /* KnC: A0-PO */
 { VD(0x8086, 0x2256), 0, 0x00, 48, 12, 7, "SKU5" }, /* KnC: A0-PO */
 { VD(0x8086, 0x2257), 0, 0x00, 4, 16, 0, "SKUZ" },
 { VD(0x8086, 0x2258), 0, 0x00, 62, 16, 0, "SKU1" }, /* KnC: ES1, ES1B */
 { VD(0x8086, 0x2258), 0, 0x10, 62, 16, 0, "SKU1" },
 { VD(0x8086, 0x2259), 0, 0x00, 52, 16, 0, "SKU3" }, /* KnC: ES1 */
 { VD(0x8086, 0x225a), 0, 0x00, 48, 12, 0, "SKU4" }, /* KnC: ES1, ES1B */
 { VD(0x8086, 0x225a), 0, 0x10, 48, 12, 0, "SKU4" }, /* KnC: ES2 */
 { VD(0x8086, 0x225a), 0, 0x11, 48, 12, 0, "SKU4" }, /* KnC: Int5 */
 { VD(0x8086, 0x225b), 0, 0x00, 52, 12, 0, "SKU3" },
 { VD(0x8086, 0x225b), 0, 0x10, 52, 12, 0, "SKU3" },
 { VD(0x8086, 0x225c), 0, 0x10, 61, 16, 0, "SKU1" }, /* KnC: Mkt1 */
 { VD(0x8086, 0x225c), 0, 0x11, 61, 16, 0, "SKU1" }, /* KnC: Mkt1 */
 { VD(0x8086, 0x225c), 0, 0x20, 61, 16, 0, "SKU1" }, /* KnC: Mkt1 */
 { VD(0x8086, 0x225d), 0, 0x10, 57, 12, 0, "SKU4" }, /* KnC: Mkt4 */
 { VD(0x8086, 0x225d), 0, 0x11, 57, 12, 0, "SKU4" }, /* KnC: Mkt3, Mkt4 */
 { VD(0x8086, 0x225d), 0, 0x20, 57, 12, 0, "SKU4" },
 { VD(0x8086, 0x225e), 0, 0x11, 57, 16, 0, "GZ" },
 { VD(0x8086, 0x225e), 0, 0x20, 57, 16, 0, "GZ" },
};
+
+
+/*
+ * Map of GDDR vendor ID vs company names
+ */
+
static struct {
 int id; /* Vendor ID as reported in SBOX scratch #9, bits 3:0 */
 char * vendor; /* Human readable company name */
} GddrVendors[] = {
 { 1, "Samsung" },
 { 2, "Quimonda" },
 { 3, "Elpida" },
 { 6, "Hynix" },
};
+
+
+
+/*
+**
+** Initializations
+**
+** This has two intended purposes:
+** - Do a one-time effort to collect info on properties that
+** are not going to change after the initial setup by
+** either bootstrap or kernel initialization.
+** - Collect initial values on things we can modify.
+** Intent is that unloading the ras module should reset
+** all state to that of the time the module was loaded.
+**
+*/
+
/*
 * One-time module initialization.
 * Snapshots hardware properties that do not change after boot
 * (stepping, core/thread counts, GDDR geometry) and records the
 * uOS and MicRas version strings. Re-invocations are no-ops,
 * guarded by the 'only_once' latch.
 */
void __init
mr_mt_init(void)
{
  static int only_once = 1;
  uint32_t scr4, scr9, scr13;
  uint32_t eax, ebx, ecx, edx;
  uint32_t thr, hwt;
  uint32_t id;
  int i;

  if (! only_once)
    return;
  only_once = 0;

  /*
   * HWINF:
   * Scratch register 13 has more info than the hwinf record
   * currently can contain, may revisit.
   * 3:0 Substepping
   * 7:4 Stepping (0 A, 2&3 B, 4 C, 6 D)
   * 11:8 Model
   * 15:12 Family (11 KnF)
   * 17:16 Processor
   * 19:18 Platform (0 Silicon, 1 FSIM, 2 MCEMU)
   * 23:20 Extended model
   * 31:24 Extended family
   *
   * Valid KnF steppings (Step + Substep):
   * "A0" (0 + 0), "A1" (0 + 1), "A2" (0 + 2),
   * "B0" (2 + 0), "B1" (3 + 1), "C0" (4 + 0),
   * "D0" (6 + 0)
   * Valid KnC steppings (Step + Substep):
   * TBD:
   */
  scr13 = mr_sbox_rl(0, SBOX_SCRATCH13);
  hwinf.rev = GET_BITS(11, 8, scr13);
  hwinf.step = GET_BITS( 7, 4, scr13);
  hwinf.substep = GET_BITS( 3, 0, scr13);

  /*
   * VERS:
   * Add OS version.
   * Strings in the response records are length-prefixed:
   * byte 0 holds the text length, the text follows.
   */
  vers.uos[0] = scnprintf(vers.uos + 1, MR_VERS_LEN -2,
		"Linux version: %s (build %s)",
		init_uts_ns.name.release,
		init_uts_ns.name.version);

  /*
   * PVERS:
   * Make MicRas version available
   */
  pver.api[0] = scnprintf(pver.api + 1, MR_PVER_LEN -2,
		"%s", RAS_VER);

  /*
   * CLST:
   * On regular CPU's this is read from CPUID 2 (htt cores)
   * and CPUID 4 (die cores), threads per cores is htt/die.
   * This does not work the same way in MIC, cores & threads
   * per core on various SKUs is not reflected by the CPUIDs.
   * All we have is the number of registered APIC IDs, which
   * happens to be the same as logical CPUs (htt cores).
   * The threads per core (die cores) is given by bootstrap in
   * scratch register #4 as a bit field.
   * 3:0 Threads per core (mask)
   * 5:4 Cache size (0,1,2: 512K, 3: 256K)
   * 9:6 GBOX channel count (0 based)
   * 29:25 ICC divider for MCLK
   * 30 Soft reset boot
   * 31 Internal flash build
   */
  cpuid(1, &eax, &ebx, &ecx, &edx);
  hwt = GET_BITS(23, 16, ebx);
  if (hwt > nr_cpu_ids)
    hwt = nr_cpu_ids;
  scr4 = mr_sbox_rl(0, SBOX_SCRATCH4);
  thr = GET_BITS(3, 0, scr4);
  /* Popcount of the thread mask gives threads per core */
  thr = bitmap_weight((const unsigned long *) &thr, 4);
  if (thr) {
    if (hwt % thr)
      printk("mr_mt_init: cpu/thr mismatch: hwt %d, thr %d, cor %d, (%d)\n",
		hwt, thr, hwt / thr, hwt % thr);
    clst.thr = thr;
  }
  else {
    printk("Who trashed scratch #4? Val 0x%08x => 0 threads/core?\n", scr4);
    clst.thr = 4; /* Best guess */
  }
  /*
   * NOTE(review): core count is derived with a fixed 4
   * threads/core instead of the detected clst.thr above;
   * presumably intentional for this hardware — confirm.
   */
  clst.count = hwt / 4;

  /*
   * GDDR:
   * Bootstrap leaves information in scratch register #9
   * about the GDDR devices. The layout is:
   * 3:0 Vendor ID, see table GddrVendors above
   * 7:4 Revision
   * 9:8 Density (00 = 512, 01 = 1024, 02 = 2048)
   * 11:10 FIFO depth
   * 15:12 DRAM info ??
   * 29 ECC enable
   */
  scr9 = mr_sbox_rl(0, SBOX_SCRATCH9);
  id = GET_BITS(3, 0, scr9);
  for(i = 0; i < ARRAY_SIZE(GddrVendors); i++)
    if (GddrVendors[i].id == id) {
      gddr.dev[0] = scnprintf(gddr.dev +1, MR_GVND_LEN -2,
			"%s", GddrVendors[i].vendor);
      break;
    }
  if (i == ARRAY_SIZE(GddrVendors))
    gddr.dev[0] = scnprintf(gddr.dev +1, MR_GVND_LEN -2, "Vendor %d", id);
  gddr.rev = GET_BITS(7, 4, scr9);
  gddr.size = 512 * (1 << GET_BITS(9, 8, scr9));

  /*
   * Card specific initialization
   */
  mr_mt_card_init();

  /*
   *TBD: Save common registers this module may change
   */
}
+
+void __exit
+mr_mt_exit(void)
+{
+ /*
+ * Card specific clean-up
+ */
+ mr_mt_card_exit();
+
+ /*
+ *TBD: Restore commmon registers this module may change
+ */
+}
+
+
+/*
+ * Return SKU properties for this card (as string)
+ * Processor can be identified on it's own easily,
+ * but the SKU reflects the impact of fuse changes
+ * which don't alter the CPU id.
+ *
+ * SKU properties:
+ * - name Name of sku (if known)
+ * - mch Number of memory channels
+ * - txs Number of texture samplers
+ */
+
+/*
+ * Why are these not defined in the includes?
+ */
+
+#ifndef SBOX_PCIE_VENDOR_ID_DEVICE_ID
+#define SBOX_PCIE_VENDOR_ID_DEVICE_ID 0x00005800
+#endif
+#ifndef SBOX_PCIE_PCI_SUBSYSTEM
+#define SBOX_PCIE_PCI_SUBSYSTEM 0x0000582c
+#endif
+#ifndef SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8
+#define SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8 0x00005808
+#endif
+
+static struct sku *
+get_sku(void)
+{
+ static struct sku * sku;
+ uint32_t dev, sub, rev, fuse;
+ char * grp;
+ int i;
+
+ if (sku)
+ return sku;
+
+ dev = mr_sbox_rl(0, SBOX_PCIE_VENDOR_ID_DEVICE_ID);
+ rev = mr_sbox_rl(0, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8);
+ sub = mr_sbox_rl(0, SBOX_PCIE_PCI_SUBSYSTEM);
+ fuse = mr_sbox_rl(0, SBOX_SCRATCH7);
+ rev = GET_BITS(7, 0, rev);
+ fuse = GET_BITS(15, 0, fuse);
+
+ /*
+ * Usually the fuse revision define a group of SKUs.
+ * Once that's determined we'll use the other details
+ * to identify the SKU within that group.
+ */
+ if (fuse >= 0 && fuse <= 1)
+ grp = "A0 PO";
+ else if (fuse >= 2 && fuse <= 3)
+ grp = "A0 ES1";
+ else if (fuse >= 4 && fuse <= 50)
+ grp = "A0 ES1B";
+ else if (fuse >= 51 && fuse <= 100)
+ grp = "B0 PO";
+ else if (fuse >= 101 && fuse <= 150)
+ grp = "B0 ES2";
+ else if (fuse >= 151 && fuse <= 152)
+ grp = "B1 PO";
+ else if (fuse >= 153 && fuse <= 154)
+ grp = "B1 PO";
+ else if (fuse == 155)
+ grp = "B1 QS";
+ else if (fuse == 156)
+ grp = "B1 PRQ";
+ else if (fuse == 157)
+ grp = "B1 PRQ/GZ";
+ else if (fuse >= 158 && fuse <= 159)
+ grp = "B1 PRQ";
+ else if (fuse >= 201 && fuse <= 203)
+ grp = "B2 PRQ/QS";
+ else if (fuse == 253)
+ grp = "C0 PO";
+ else if (fuse == 254)
+ grp = "C0 QS";
+ else
+ grp = "???";
+
+ /*
+ * Now determine which member of the group.
+ * Take hints from PCIe device ID and revision.
+ * Device ID mappings is a mess, see table above.
+ * Revision has a simple mapping (follows fuses):
+ * 0x00 => A0 cards
+ * 0x10 => B0 cards
+ * 0x11 => B1 cards
+ * 0x20 => C0 cards
+ * 0x21 => C1 cards (if ever to be made)
+ */
+ for(i = 0; i < ARRAY_SIZE(skuList); i++) {
+ if (dev == skuList[i].devID) {
+ if (skuList[i].subID && sub != skuList[i].subID)
+ continue;
+ if (rev != skuList[i].revNo)
+ continue;
+
+ /*
+ * Found one, this is the place to cross reference it
+ * - memory channels should match SCR4 bits 9:6
+ */
+ break;
+ }
+ }
+
+ if (i < ARRAY_SIZE(skuList)) {
+ sku = skuList + i;
+ printk("RAS: card %x:%x:%x is a \"%s %s\" (%d cores, %d memch, %d txs)\n",
+ dev, sub, rev, grp, sku->name, sku->cr, sku->ch, sku->tx);
+ }
+
+ return sku;
+}
+
+#if NOT_YET
+char *
+mr_sku(void)
+{
+ struct sku * sku;
+
+ sku = get_sku();
+ return sku ? sku->name : 0;
+}
+#endif
+
+int
+mr_mch(void)
+{
+ struct sku * sku;
+
+ sku = get_sku();
+ return sku ? sku->ch : 0;
+}
+
+int
+mr_txs(void)
+{
+ struct sku * sku;
+
+ sku = get_sku();
+ return sku ? sku->tx : 0;
+}
+
+
+/*
+**
+** MT Get functions
+**
+** All works the same way; they get an opaque pointer to
+** a place where the return structure can be placed. The
+** return value is either the amount (bytes) to be shipped
+** back in response or one of the MR_* error codes.
+**
+*/
+
+int
+mr_get_hwinf(void * p)
+{
+ struct mr_rsp_hwinf * r;
+
+ r = (struct mr_rsp_hwinf *) p;
+ *r = hwinf;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_vers(void * p)
+{
+ struct mr_rsp_vers * r;
+
+ r = (struct mr_rsp_vers *) p;
+ *r = vers;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_pver(void * p)
+{
+ struct mr_rsp_pver * r;
+
+ r = (struct mr_rsp_pver *) p;
+ *r = pver;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_clst(void * p)
+{
+ struct mr_rsp_clst * r;
+
+ r = (struct mr_rsp_clst *) p;
+ *r = clst;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_gddr(void * p)
+{
+ struct mr_rsp_gddr * r;
+
+ r = (struct mr_rsp_gddr *) p;
+ *r = gddr;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_trc(void * p)
+{
+ struct mr_rsp_trc * r;
+
+ r = (struct mr_rsp_trc *) p;
+ *r = trc;
+ return sizeof(*r);
+}
+
+
/*
 * MR_REQ_CUTL handler: report CPU utilization.
 * Fills the response with system-wide and per-core
 * user/nice/sys/idle tick counters taken from the kernel's
 * per-cpu kstat, plus the tick rate and current jiffy count
 * (boot-based) so the host can compute rates.
 * Returns the number of bytes placed in the buffer.
 */
int
mr_get_cutl(void * p)
{
  struct mr_rsp_cutl * r;
  struct timespec tp;
  struct cpu_usage_stat * u;
  uint64_t user, nice, sys, idle;
  int i, n;

  r = (struct mr_rsp_cutl *) p;
  memset(r, '\0', sizeof(*r));
  r->tck = ACTHZ;		/* Actual tick rate, for jiffy -> time conversion */
  r->core = clst.count;
  r->thr = clst.thr;
  ktime_get_ts(&tp);
  monotonic_to_bootbased(&tp);
  r->jif = timespec_to_jiffies(&tp);

  for_each_possible_cpu(i) {
    u = & kstat_cpu(i).cpustat;

    /* Fold irq/softirq into sys and iowait into idle */
    user = u->user;
    nice = u->nice;
    sys = u->system + u->irq + u->softirq;
    idle = u->idle + u->iowait;

    r->sum.user += user;
    r->sum.nice += nice;
    r->sum.sys += sys;
    r->sum.idle += idle;

    /*
     * Currently the boot processor is thread 0 of the last
     * enabled core. Thus, on a 32 core machine, we get:
     *
     * cpu # 0 1 2 3 4 5 .. 124 125 126 127
     * core # 31 0 0 0 0 1 .. 30 31 31 31
     * apic ID 124 0 1 2 3 4 .. 123 125 126 127
     *
     * The core is included in the per-cpu CpuInfo struct,
     * and it should be safe to get it from there.
     */
    n = cpu_data(i).cpu_core_id;
    if (n < r->core) {		/* Guard against out-of-range core ids */
      r->cpu[n].user += user;
      r->cpu[n].nice += nice;
      r->cpu[n].sys += sys;
      r->cpu[n].idle += idle;
    }
  }

  return sizeof(*r);
}
+
+
+int
+mr_get_mem(void * p)
+{
+ struct mr_rsp_mem * r;
+ struct sysinfo si;
+
+ si_meminfo(&si);
+
+ r = (struct mr_rsp_mem *) p;
+ memset(r, '\0', sizeof(*r));
+ r->total = si.totalram << (PAGE_SHIFT - 10);
+ r->free = si.freeram << (PAGE_SHIFT - 10);
+ r->bufs = si.bufferram << (PAGE_SHIFT - 10);
+
+ return sizeof(*r);
+}
+
+
/*
 * MR_REQ_OS handler: report OS level status.
 * Fills in uptime (rounded up to whole seconds), the three
 * load averages, and the pids of up to ARRAY_SIZE(r->apid)
 * user application processes (kernel threads, starting/exiting
 * tasks and non-group-leaders are skipped). 'alen' holds the
 * total count seen, which may exceed the apid array capacity.
 * Returns the number of bytes placed in the buffer.
 */
int
mr_get_os(void * p)
{
  struct mr_rsp_os * r;
  uint16_t i;
  struct timespec tp;
  struct task_struct * t;

  ktime_get_ts(&tp);
  monotonic_to_bootbased(&tp);

  r = (struct mr_rsp_os *) p;
  memset(r, '\0', sizeof(*r));
  r->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);	/* Round any fraction up */
  r->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
  r->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
  r->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);

  /*
   * Walk process list and identify processes that
   * are associated with user programs. For now we
   * exclude kernel threads and non-stable processes.
   *
   *TBD: Really just wanted to take the task_lock, but
   * it is not exported to modules. It seems to be
   * tied into the RCU logic, so locking the whole
   * RCU should do the trick as long as it's just
   * for a very short time.
   */
  i = 0;
  rcu_read_lock();
  for_each_process(t) {
    if ((t->flags & (PF_KTHREAD | PF_STARTING | PF_EXITING)) ||
	(t->group_leader && t->group_leader != t))
      continue;

    if (i < ARRAY_SIZE(r->apid))
      r->apid[i] = t->pid;
    i++;
  }
  rcu_read_unlock();
  r->alen = i;

  return sizeof(*r);
}
+
+
/*
 * MR_REQ_PROC handler: report status of one process.
 * The pid to look up arrives as a uint32_t at 'p' and the
 * response record is written back over the same buffer.
 * Returns the response size, or -MR_ERR_NOVAL when the pid
 * is zero or no such process exists.
 */
int
mr_get_proc(void * p)
{
  struct mr_rsp_proc * r;
  struct task_struct * t, * s;
  struct mm_struct * mm;
  struct timespec uptime, start, ts;
  cputime_t utime, stime;
  pid_t pid;
  int err, i;

  err = -MR_ERR_NOVAL;
  pid = * (uint32_t *) p;
  if (! pid)
    return err;

  r = (struct mr_rsp_proc *) p;
  memset(r, '\0', sizeof(*r));
  do_posix_clock_monotonic_gettime(&uptime);

  rcu_read_lock();
  t = pid_task(find_pid_ns(pid, &init_pid_ns), PIDTYPE_PID);
  if (t) {
    /*
     * Found process, get base stats.
     * r->name is length-prefixed: byte 0 gets the length later.
     * NOTE(review): strncpy does not guarantee NUL termination;
     * this relies on t->comm fitting in r->name — confirm sizes.
     */
    r->pid = pid;
    strncpy(r->name +1, t->comm, sizeof(r->name) -1);
    start = t->start_time;
    utime = t->utime;
    stime = t->stime;
    mm = get_task_mm(t);	/* Takes a reference, dropped by mmput() below */
    if (mm) {
#ifdef SPLIT_RSS_COUNTING
      r->rss = atomic_long_read(& mm->rss_stat.count[MM_FILEPAGES]) +
	       atomic_long_read(& mm->rss_stat.count[MM_ANONPAGES]);
#else
      r->rss = mm->rss_stat.count[MM_FILEPAGES] +
	       mm->rss_stat.count[MM_ANONPAGES];
#endif
      r->vm = mm->total_vm;
      mmput(mm);
    }

    /*
     * Next try get list of threads (if any).
     * Only group leaders are walked; up to ARRAY_SIZE(r->tpid)
     * thread pids are reported, tlen holds the total count.
     */
    i = 0;
    if (!t->group_leader || t->group_leader == t) {
      s = t;
      do {
        if (s->pid != pid) {
          if (i < ARRAY_SIZE(r->tpid))
            r->tpid[i++] = s->pid;
        }
      } while_each_thread(t, s);
    }
    r->tlen = i;
    err = sizeof(*r);
  }
  rcu_read_unlock();

  /*
   * Convert values into API formats (uSec, kB).
   */
  if (err > 0) {
    r->name[0] = strlen(r->name +1);
    ts = timespec_sub(uptime, start);
    r->etime = timespec_to_ns(&ts) / NSEC_PER_USEC;
    r->utime = jiffies_to_usecs(utime);
    r->stime = jiffies_to_usecs(stime);
    r->vm = r->vm << (PAGE_SHIFT - 10);
    r->rss = r->rss << (PAGE_SHIFT - 10);
  }

  return err;
}
+
+
+
+/*
+**
+** MT Set functions
+**
+** All works the same way; they get an opaque pointer to
+** a location where the 'set' parameter from the request is
+** placed. Return code is one of the MR_* error codes.
+**
+** Input screening takes place here (to the extent possible).
+**
+*/
+
+
+#if NOT_YET
/*
 * MR_SET_GVOLT handler.
 * GDDR voltage cannot be changed from the uOS side,
 * so the request is accepted and success is reported.
 */
int
mr_set_gvolt(void * p)
{
  return 0;
}
+
+
/*
 * MR_SET_GFREQ handler.
 * GDDR frequency cannot be changed from the uOS side,
 * so the request is accepted and success is reported.
 */
int
mr_set_gfreq(void * p)
{
  return 0;
}
+#endif
+
+
+int
+mr_set_trc(void * p)
+{
+ /*
+ * No idea on what to do with this
+ */
+ trc.lvl = *(uint32_t *) p;
+ return 0;
+}
+
+
+
+/*
+**
+** MT Process controls
+**
+*/
+
/*
 * MR_CMD_PKILL handler: send a signal to one process.
 * The 32-bit parameter packs the target pid in bits 23:0 and
 * the signal number in bits 31:24. Kernel threads, transient
 * tasks and non-group-leaders are refused (-MR_ERR_INVAUX),
 * and processes owned by system users are protected
 * (-MR_ERR_PERM). Returns 0 on successful signal delivery.
 */
int
mr_cmd_pkill(void * p)
{
  struct task_struct * t;
  const struct cred * cred;
  pid_t pid;
  uint32_t val;
  int sig, ret;

  val = *(uint32_t *) p;
  pid = GET_BITS(23, 0, val);
  sig = GET_BITS(31, 24, val);

  ret = -MR_ERR_INVAUX;
  rcu_read_lock();
  t = pid_task(find_pid_ns(pid, &init_pid_ns), PIDTYPE_PID);
  if (t) {
    if (!(t->flags & (PF_KTHREAD | PF_STARTING | PF_EXITING)) &&
	!(t->group_leader && t->group_leader != t)) {

      /*
       * Only allow signaling processes owned by non-system
       * users; presumably uids below 500 are reserved for
       * system accounts on this platform — confirm.
       */
      cred = __task_cred(t);
      if (cred->euid >= 500) {
        if (!send_sig(sig, t, 1))
          ret = 0;
      }
      else
        ret = -MR_ERR_PERM;
    }
  }
  rcu_read_unlock();

  return ret;
}
+
+
/*
 * MR_CMD_UKILL handler: signal all processes of one user.
 * The 32-bit parameter packs the target uid in bits 23:0 and
 * the signal number in bits 31:24. Uids below 500 (system
 * accounts, presumably — see mr_cmd_pkill) are refused with
 * -MR_ERR_PERM. Kernel threads, transient tasks and
 * non-group-leaders are skipped. The walk stops at the first
 * failed delivery, reported as -MR_ERR_INVAUX; 0 on success.
 */
int
mr_cmd_ukill(void * p)
{
  struct task_struct * t;
  const struct cred * cred;
  uid_t uid;
  uint32_t val;
  int sig, ret;

  val = *(uint32_t *) p;
  uid = GET_BITS(23, 0, val);
  sig = GET_BITS(31, 24, val);

  if (uid < 500)
    return -MR_ERR_PERM;

  ret = 0;
  rcu_read_lock();
  for_each_process(t) {
    if ((t->flags & (PF_KTHREAD | PF_STARTING | PF_EXITING)) ||
	(t->group_leader && t->group_leader != t))
      continue;

    cred = __task_cred(t);
    if (cred->euid == uid) {
      ret = send_sig(sig, t, 1);
      if (ret)
        break;
    }
  }
  rcu_read_unlock();

  return ret ? -MR_ERR_INVAUX : 0;
}
+
+
+/*
+**
+** Debug utilities.
+** Remove or comment out when development complete!
+**
+*/
+
+#if EE_VERIFY
+/*
+ * Hex dumper
+ */
+
+#include <linux/ctype.h>
+
+#define ALEN 9 /* Digits of address shown */
+
/*
 * Dump 'len' bytes at 'ptr' to the kernel log, 16 bytes per
 * line, in the classic hexdump layout: address, hex bytes
 * (gap after byte 8), then printable ASCII. Repeated lines
 * are suppressed with a single '*' marker, except the last
 * line which is always shown. An optional printf-style
 * message is printed as a heading first.
 */
void
dmp_hex(void *ptr, int len, const char *msg, ...)
{
  unsigned char * d;
  unsigned char * prev;		/* Last line actually printed */
  int n, m;
  int star;			/* Non-zero once '*' emitted for current run */
  char asc[16 + 1];

  star = 0;
  prev = 0;

  /*
   * Print message (if any).
   * It is treated as a 'printf' format strings with arguments.
   */
  if (msg) {
    va_list ap;

    va_start(ap, msg);
    vprintk(msg, ap);
    va_end(ap);
    printk("\n");
  }

  /*
   * Loop trying to dump 16 bytes at a time
   */
  for(d = (unsigned char *) ptr;; d += 16) {

    /*
     * Locate dump area from input buffer;
     */
    n = (len > 16) ? 16 : len;
    len -= n;

    /*
     * Skip repeated lines.
     * I want the last line shown on the output.
     */
    if (d != ptr && n == 16 && !memcmp(d, prev, 16)) {
      if (len) {
        if (!star) {
          star = 1;
          printk("%*s\n", ALEN + 3, "*");
        }
        continue;		/* Note: 'prev' deliberately not advanced */
      }
    }

    /*
     * Print one line of hex dump.
     * Address is masked to ALEN hex digits.
     */
    if (n) {
      printk("%*lx ", ALEN, ((long) d) & ((1L << 4 * ALEN) - 1));
      for(m = 0; m < n; m++) {
        printk("%02x ", d[m]);
        if (m == 7)
          printk(" ");
        asc[m] = (isascii(d[m]) && isprint(d[m])) ? d[m] : '.';
      }
      asc[m] = '\0';
      /* Pad a short last line so the ASCII column stays aligned */
      printk("%*s %s\n", 3 * (16 - m) + (m < 8), "", asc);
    }

    /*
     * We are done when end of buffer reached
     */
    if (!len)
      break;

    /*
     * Reset repeat line suppression
     */
    star = 0;
    prev = d;
  }
}
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS handler for core MC events
+ *
+ * Contains code to intercept MC events, collect information
+ * from core MCA banks on originating core and possibly on
+ * all active cores if necessary.
+ *
+ * In case of a severe event, defined by corrupted context,
+ * the handler will add a record of the event in the designated
+ * EEPROM hanging off the Over-Clocking I2C bus. Next a message
+ * will be sent to the SMC (enabling IPMI notifications) and at
+ * last a message is sent to host via the MC SCIF connection
+ * (if MC SCIF session has been established).
+ *
+ * Lesser events will also be sent to the host on a 'FYI' basis,
+ * but no record will be stored in the event log, nor will the
+ * SMC be notified.
+ *
+ * Special cases of high rate correctable errors may also cause
+ * events to be recorded in EEPROM on the assumption that the
+ * root cause will be detectable from maintenance mode.
+ *
+ * The handler cannot expect any support from the OS while in
+ * exception (NMI) context. Therefore, NMI-safe routines has
+ * been added to mimic some kernel services, e.g. ee_print().
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/io.h>
+#include <linux/cpumask.h>
+#include <asm/mce.h>
+#include <asm/apic.h>
+#include "micras.h"
+
+
+/*
+**
+** Brief design notes:
+** There are two ways this code normally will be entered.
+**
+** 1) From standard interrupt context (bottom-half).
+** This is supporting MC events picked up by the
+** machine_check_poll(), i.e. events that aren't
+** causing state corruption (UC bit not set).
+**
+** 2) From exception/NMI context.
+** This handles errors that _did_ flag processor
+** state corruption (UC bit set, or other condition
+** causing the kernel exception handler to pick it up).
+**
+** Both cases can happen simultaneously on different CPU's,
+** which require careful considerations about re-entrant code
+** behaviour here. Particularly nasty is exception context where
+** normal spinlocks won't work (FYI: x86 spinlocks assume interrupt
+** disable can protect a critical region, an assumption that is
+** false when an exception/NMI occur).
+**
+** Standard interrupt context entries occur when non-fatal and
+** thus non-critical MC events are handled. In most cases just
+** results in a regular SCIF send of McInfo structs to the host.
+** Note that the call chain origin is a callout from the timer
+** thread, not from an interrupt service routine, so to name
+** it as standard interrupt context is somewhat misleading.
+**
+** Exception context messages are usually fatal and must be
+** dealt with immediately, because otherwise the generic machine
+** handler may panic() the system when exiting exception handler
+** (default behavior, may be tweaked by altering 'threshold').
+**
+** In order to proceed we can either implement a locking mechanism
+** at every API function entry, or we can let every function do it's
+** thing independently. The latter is preferred, though it gets
+** somewhat complicated because the API between the generic MC
+** handling and RAS module is in fact composed of several calls.
+**
+** If state between API calls needs to be tracked then that can be
+** done by means of pre-allocated arrays, similar to the generic
+** handling in the Linux kernel. Currently the only state variable
+** is the mask of CPUs that has been sent an IPI.
+**
+** Core MC events can be simulated by using the 'mce-inject' tool,
+** consisting of a kernel module and a text mode application program.
+** The 'mce-inject' module knows the difference between fatal and
+** non-fatal events (defined by the UC bit) and acts differently
+** in the two cases. Non-fatal injections cause machine_check_poll()
+** to be called on all CPUs, resulting in events being reported to
+** function mce_poll(). Fatal injections cause do_machine_check()
+** to be called on all CPUs, resulting in calls to the mcc_exc_*
+** routines below. Activities triggered by mce-inject are flagged
+** as 'fake', and shall _NOT_ be logged in the EEPROM.
+**
+** Warning:
+** Controls in the generic MC handling may cause the kernel to
+** panic, _ALSO_ even if no event was found in any MCA banks!!
+** Not sure exactly how to capture that sort of event.
+**
+** Warning:
+** The 'mce-inject' module uses different methods of invoking error
+** handling routines, depending on the mce record (inject_flags).
+** Specifically, the 'mce-inject' module may use of broadcast NMIs
+** to invoke machine_check_poll() or do_machine_check() on all CPUs,
+** which will make these functions execute in exception context.
+** The NMI broadcast mechanism is based on registering a handler on
+** the 'die' notifier chain and then doing an
+** apic->send_IPI_mask(.., NMI_VECTOR),
+** knowing that do_nmi() will invoke this notifier chain when no
+** genuine cause of NMI was found (i.e. if inb(61) returns 0xc0,
+** [which is SERR + IOCHK on chipset register NSR]).
+** Long story short; if 'mce-inject' is used we can not expect that
+** polling is done in standard interrupt context, and need to set
+** the 'in exception context' flag for SCIF access.
+**
+*/
+
+
+/*
+ * Hooks placed in the native machine check handler
+ * See file arch/x86/kernel/cpu/mcheck/mce.c for placement.
+ *
+ * poll After entering a non-UC event into mce_log.
+ * This happens in normal thread context, which
+ * means that kernel services are available.
+ * exc_flt Filter on correctable errors. If events occur
+ * at a very high rate they can severely slow
+ * down the system and/or crash it entirely.
+ * Logic here will disable reporting of some
+ * events if they are seen too often.
+ * exc_entry Entering MC exception handler.
+ * Called _after_ reading MCG_STATUS and the early
+ * severity assessment by mce_severity() has been
+ * performed on all banks, such that we get to
+ * know if the native MC handler will panic.
+ * exc_log After entering a UC event into mce_log.
+ * The logged mce record has all available
+ * details on the event, and this point is the
+ * best place to perform our RAS activities.
+ * exc_panic Right before the MC exception handler calls
+ * the panic function.
+ * exc_exit Exit the MC exception handler
+ * print Exception context safe printf to POST-card UART
+ */
+
/* Hook pointers filled in by this module; the patched kernel MC
 * handler calls through them (see placement notes above). */
extern void (*mca_poll)(struct mce *, uint64_t, int);
extern void (*mca_exc_flt)(struct mce *, uint64_t, int);
extern void (*mca_exc_entry)(struct mce *, int, int, int, char *);
extern void (*mca_exc_log)(struct mce *, uint64_t, int, int, char *, int, int);
extern void (*mca_exc_panic)(struct mce *, char *, char *, int);
extern void (*mca_exc_exit)(struct mce *, int, int, int, int);
extern int (*mca_print)(char *, ...);

extern struct mce_log mcelog; /* Export from kernel */
extern struct mutex mce_read_mutex; /* Export from kernel */
static unsigned mcc_seen; /* Last event in kernel log */
int in_sync; /* Flag when sync'ing */
+
+
+/*
+ * Convert a kernel mce record into a MC API format
+ */
+
+static void
+mcc_conv(struct mce * mce, struct mce_info * mc)
+{
+  /* Field-for-field translation of the kernel's mce record */
+  mc->org = mce->bank;			/* Originating MCA bank */
+  mc->id = mce->extcpu;			/* Logical CPU that logged the event */
+#ifdef CONFIG_MK1OM
+  /*
+   * NOTE(review): assumes xlat_cpu[] maps APIC ids to the
+   * physical core numbering used by the host — TODO confirm.
+   */
+  mc->pid = xlat_cpu[cpu_data(mc->id).apicid];
+#endif
+  mc->stamp = mce->time;		/* Wall-clock time of event */
+  mc->status = mce->status;
+  mc->addr = mce->addr;
+  mc->misc = mce->misc;
+  /* Uncorrected events are flagged fatal for host/EEPROM logging */
+  mc->flags = (mc->status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0;
+}
+
+
+/*
+ * Filter for correctable errors, may modify CTL value.
+ * The filter is pretty crude, we just want to protect
+ * ourselves from being run over by fast recurring events.
+ * We keep tabs of events seen in a static array.
+ *
+ * Algorithm is like this:
+ * - test if event is in filter list; if not exit filter.
+ * - search for instance of this event in history.
+ * - if not found, insert event in history (strike 1).
+ * - if found but time since last seen exceeds window,
+ * then treat event as new in history (new strike 1).
+ * - if found and within time window, bump strike counter.
+ * - if strike counter reach maximum, we're fed up and
+ * turn this event off by clearing the associated
+ * bit in the offending MCA bank's CTL register and
+ * send a 'filter' event notification to the host.
+ *
+ * Advantages of this design is:
+ * - individual parameters for every filtered event.
+ * - only one event history array.
+ * - no periodic aging of events in history array.
+ * - no averaging over time required.
+ * - no moving/reordering of event history entries.
+ * - new events do not replace older seen event
+ * - filter reacts immediately when max reached.
+ *
+ * Disadvantages are:
+ * - linear search through filter array.
+ * - linear search through history array.
+ * - time parameter not obvious, it's really a limit
+ * on how old events in history are allowed to be.
+ * - in pathological cases the filter's reaction time
+ * will be max * window (when events trickle in at
+ * a rate just below the window size).
+ * - data in ADDR and MISC registers are not used to
+ * match current event with history. Should they be?
+ *
+ * For now, both lists are short enough that introducing
+ * more advanced searches probably are not going to help.
+ *
+ * On KnC the flash may have overrides of the mc_turnoff table.
+ */
+
+/*
+ * Default time window: 17.5 hours, in seconds.
+ * Fully parenthesized so the macro stays correct if it is ever
+ * used inside a larger expression (the old form ended in '* 60'
+ * outside the parentheses, a precedence trap).
+ */
+#define FT	(((17 * 60) + 30) * 60)
+
+/* Event history: one slot per distinct correctable event seen */
+static struct mc_hist {
+  uint32_t	count;		/* How many times seen */
+  uint64_t	last;		/* TSC last time seen (0 = slot free) */
+  struct mce_info mc;		/* Local MC event record */
+} mc_history[32];
+
+/* Filter descriptors: which events may be silenced, and when */
+static struct mc_disc {
+  uint8_t	bank, ctl;	/* Bank selector and control bit # */
+  uint16_t	win;		/* Time window (seconds) */
+  uint16_t	max;		/* Max count */
+  uint16_t	mca_code;	/* MCA code, status[15:0] */
+  uint16_t	mdl_code;	/* Model code, status[31:16] */
+} mc_turnoff[] = {
+  {  0,  3,  FT,  2,  0x0150, 0x0000 },	/* MC0: J-Cache error */
+  {  1,  0,  FT,  2,  0x010a, 0x0001 },	/* MC1: L2 Tag error */
+  {  1,  4,  FT,  2,  0x010a, 0x0010 },	/* MC1: L2 Data error */
+  {  2,  2,  FT,  2,  0x010d, 0x0100 },	/* MC2: Tag State, ext TD */
+  {  2,  2,  FT,  2,  0x010d, 0x0101 },	/* MC2: Tag State, int TD */
+  {  2,  3,  FT,  2,  0x012d, 0x0110 },	/* MC2: Core Valid, ext TD */
+  {  2,  3,  FT,  2,  0x012d, 0x0111 },	/* MC2: Core Valid, int TD */
+  {  3,  2,  FT,  2,  0x010d, 0x0100 },	/* DBOX: Tag State error, ext TD */
+  {  3,  2,  FT,  2,  0x010d, 0x0101 },	/* DBOX: Tag State error, int TD */
+  {  3,  3,  FT,  2,  0x012d, 0x0110 },	/* DBOX: Core Valid error, ext TD */
+  {  3,  3,  FT,  2,  0x012d, 0x0111 },	/* DBOX: Core Valid error, int TD */
+  {  4,  4,  FT,  2,  0x0e0b, 0x0030 },	/* SBOX: PCI-e */
+  {  5,  0,  FT,  2,  0x0001, 0x0000 },	/* GBOX: Ch-0 retraining */
+  {  5,  1,  FT,  2,  0x0001, 0x0001 },	/* GBOX: Ch-1 retraining */
+  {  5,  2,  FT,  2,  0x0001, 0x0002 },	/* GBOX: Ch-0 ECC error */
+  {  5,  3,  FT,  2,  0x0001, 0x0003 },	/* GBOX: Ch-1 ECC error */
+  {  6,  3,  FT,  2,  0x010e, 0x0008 },	/* TBOX: T2 CRC error */
+};
+
+
+#ifdef CONFIG_MK1OM
+
+#define MC_FLT_SIG1 0x0e13c20f /* Start signature */
+#define MC_FLT_SIG2 0xf1ec3df0 /* End signature */
+#define MC_FLT_SIZE 0x200 /* Filter block length */
+
+/*
+ * Parse a filter-override block (read from flash) and replace the
+ * built-in mc_turnoff defaults with its contents.
+ *
+ * Block layout (MC_FLT_SIZE = 0x200 bytes at 'p'):
+ *   [0..3]    MC_FLT_SIG1 start signature
+ *   [4..5]    filter count (uint16_t)
+ *   [6.. ]    'count' struct mc_disc descriptors
+ *   [last 4]  MC_FLT_SIG2 end signature
+ *
+ * NOTE(review): the 32/16-bit loads assume 'p' is suitably aligned
+ * and that struct mc_disc has no padding — TODO confirm against the
+ * producer of the flash image.
+ */
+void
+mcc_flt_parm(uint8_t * p)
+{
+  uint16_t	fnum;
+
+  /*
+   * Check signatures
+   */
+  if (*((uint32_t *) p) != MC_FLT_SIG1 ||
+      *((uint32_t *)(p + MC_FLT_SIZE - 4)) != MC_FLT_SIG2) {
+    printk("mcc_flt_parm: signatures not found, (%08x, %08x)\n",
+		*((uint32_t *) p), *((uint32_t *)(p + MC_FLT_SIZE - 4)));
+    return;
+  }
+
+  /*
+   * After start signature comes filter count (uint16_t)
+   * followed by 'count' filter descriptors (struct mc_disc).
+   * The '+ 10' accounts for both signatures plus the count field.
+   */
+  fnum = *(uint16_t *)(p + 4);
+  if (fnum > ARRAY_SIZE(mc_turnoff) ||
+      fnum * sizeof(struct mc_disc) + 10 > MC_FLT_SIZE) {
+    printk("mcc_flt_parm: filter count %d not valid\n", fnum);
+    return;
+  }
+
+  /*
+   * Seems the table is legit, copy it over defaults.
+   * Unused trailing entries are zeroed (bank 0, code 0 never matches
+   * a real event in practice since count/window become 0).
+   */
+  memset(mc_turnoff, '\0', sizeof(mc_turnoff));
+  memcpy(mc_turnoff, p + 6, fnum * sizeof(struct mc_disc));
+#if MC_VERBOSE
+  {
+    int i;
+
+    for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) {
+      printk("Filter %2d: bank %d, ctl %d, win %d, max %d, mca %04x, mdl %04x\n",
+		i, mc_turnoff[i].bank, mc_turnoff[i].ctl, mc_turnoff[i].win,
+		mc_turnoff[i].max, mc_turnoff[i].mca_code, mc_turnoff[i].mdl_code);
+    }
+  }
+#endif
+}
+
+#endif
+
+
+/*
+ * Frequency filter for core and un-core MC events
+ */
+
+/*
+ * Frequency filter for core and un-core correctable MC events.
+ *
+ * mc   event in MC API format (untouched unless filter trips)
+ * tsc  time stamp of the event
+ * exc  non-zero when called from exception context
+ *
+ * Returns 0 when the event passes, or a bit mask to be cleared
+ * in the offending bank's CTL register when the event source
+ * must be silenced (see mcc_ctl_mask()).
+ */
+uint32_t
+micras_mc_filter(struct mce_info * mc, uint64_t tsc, int exc)
+{
+  struct mc_disc	* dsc;
+  struct mc_hist	* hst;
+  uint64_t	ostamp;
+  int		i, oldest;
+
+  /* Only correctable events are subject to filtering */
+  if (mc->status & MCI_STATUS_UC)
+    return 0;
+
+  /*
+   * Check if this event may be filtered
+   */
+  dsc = mc_turnoff;
+  for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) {
+    if (dsc->bank == mc->org &&
+	dsc->mca_code == GET_BITS(15, 0, mc->status) &&
+	dsc->mdl_code == GET_BITS(31, 16, mc->status))
+      break;
+    dsc++;
+  }
+  if (i == ARRAY_SIZE(mc_turnoff))
+    return 0;
+
+  /*
+   * Have a candidate for filter.
+   * Have we seen this one before?
+   */
+  oldest = 0;
+  ostamp = tsc;
+  hst = mc_history;
+  for(i = 0; i < ARRAY_SIZE(mc_history); i++) {
+    /*
+     * While scanning, find the oldest event too
+     * (slots with last == 0 are free and count as oldest)
+     */
+    if (hst->last < ostamp) {
+      ostamp = hst->last;
+      oldest = i;
+    }
+
+    /*
+     * Does this match event in filter history?
+     * TBD: how much needs to match?
+     * For now: cpu (or box), bank, mca_code and model_code.
+     */
+    if (hst->last &&
+        hst->mc.id == mc->id &&
+	hst->mc.org == mc->org &&
+        GET_BITS(15, 0, hst->mc.status) == GET_BITS(15, 0, mc->status) &&
+        GET_BITS(31, 16, hst->mc.status) == GET_BITS(31, 16, mc->status))
+      break;
+    hst++;
+  }
+  if (i == ARRAY_SIZE(mc_history)) {
+    /*
+     * Not seen this event before.
+     * 'oldest' is where to store this event.
+     */
+    hst = mc_history + oldest;
+    hst->count = 1;
+    hst->last = tsc;
+    hst->mc = *mc;
+    return 0;
+  }
+
+  /*
+   * Already 'on file in history', test expiration date
+   */
+  if (hst->last + dsc->win * (cpu_khz * 1000LL) < tsc) {
+    /*
+     * Matching history element had expired, just overwrite it
+     */
+    hst->count = 1;
+    hst->last = tsc;
+    hst->mc = *mc;
+    return 0;
+  }
+
+  /*
+   * Filter element active, bump count and set last seen.
+   * We do _NOT_ want injected events to enter the EEPROM,
+   * so that flag is preserved over all event history
+   */
+  hst->count++;
+  if (mc->flags & MC_FLG_FALSE)
+    hst->mc.flags |= MC_FLG_FALSE;
+  if (hst->count < dsc->max) {
+    hst->last = tsc;
+    return 0;
+  }
+
+  /*
+   * Threshold reached, event source needs to be silenced.
+   * Store a record of this in the EEPROM and send a
+   * notification to host about it. Once duly reported, clear
+   * event from the filter; it is not expected to show up again.
+   * Note: we report the _first_ event seen, not the
+   *       event at hand. We could save array space
+   *       by sending latest event (less info to keep).
+   * Note: both the table index (ptrdiff_t) and the millisecond
+   *       delta (uint64_t) are cast to int to match the '%d'
+   *       conversions — passing 64-bit values to '%d' through
+   *       varargs is a format mismatch.
+   */
+  ee_printk("RAS: MCE filter #%d: bank %d, bit %d, limit %d, delta %d (mS)\n",
+	(int)(dsc - mc_turnoff), dsc->bank, dsc->ctl, dsc->max,
+	(int)((tsc - hst->last) / cpu_khz));
+  hst->mc.flags |= MC_FLG_FILTER;
+#ifdef CONFIG_MK1OM
+  if (!(hst->mc.flags & MC_FLG_FALSE)) {
+    micras_mc_log(&hst->mc);
+    hst->mc.flags |= MC_FLG_LOG;
+  }
+#endif
+  micras_mc_send(&hst->mc, exc);
+  hst->last = 0;
+
+  /*
+   * MC events are disabled by caller when a
+   * non-zero mask is returned by this routine.
+   */
+  return (1 << dsc->ctl);
+}
+
+
+/*
+ * Remove/mask an 'enable-bit' from a core MCA bank.
+ * Note: This applies to _current_ cpu only. It is not explicitly
+ * linked to the cpu that was ID'd in the incoming mce struct.
+ * Happens to be OK for mcc_exc_flt() and mcc_poll() and mcc_exc_log().
+ */
+
+static void
+mcc_ctl_mask(int bank, uint32_t msk)
+{
+  uint32_t lo, hi;
+
+  /*
+   * Read-modify-write the bank's CTL MSR on the current CPU,
+   * clearing the enable bits given in 'msk'.
+   */
+  rdmsr(MSR_IA32_MCx_CTL(bank), lo, hi);
+  lo &= ~msk;
+  wrmsr(MSR_IA32_MCx_CTL(bank), lo, hi);
+
+#if MC_VERBOSE
+  ee_printk("RAS: ctl mask CPU %d, MC%d_CTL -> %x\n", smp_processor_id(), bank, lo);
+#endif
+}
+
+
+/*
+ * Filtering of correctable core MC events
+ * Called from the exception handler.
+ */
+
+static void
+mcc_exc_flt(struct mce * mce, uint64_t ctl, int fake)
+{
+  struct mce_info	info;
+  uint32_t		mask;
+
+  /*
+   * Nothing to do without an event record, and uncorrectable
+   * events are never subject to frequency filtering.
+   */
+  if (!mce || (mce->status & MCI_STATUS_UC))
+    return;
+
+  /*
+   * Convert to MC API format and run the frequency filter;
+   * silence the offending bank if it trips.
+   */
+  mcc_conv(mce, &info);
+  info.ctl = ctl;
+  info.flags = fake ? MC_FLG_FALSE : 0;
+
+  mask = micras_mc_filter(&info, mce->tsc, 1);
+  if (mask)
+    mcc_ctl_mask(mce->bank, mask);
+}
+
+
+/*
+ * Only action required for polled MC events is to
+ * pass the event on to the SCIF channel (if connected).
+ * The event should already have caused an exception (the
+ * exception handler chooses to ignore corrected errors)
+ * which means it already has been filtered.
+ * Injected corrected events do not cause MCE exceptions
+ * and thus escaped filtering, so we'll filter them here.
+ */
+
+/*
+ * Handle an MC event found by the kernel's periodic poll.
+ * Forwards the event over SCIF (if connected) and runs the
+ * frequency filter on events that escaped the exception path
+ * (injected events and events without the EN bit).
+ */
+static void
+mcc_poll(struct mce * mce, uint64_t ctl, int fake)
+{
+  struct mce_info mc;
+
+  /*
+   * Guard against a missing record, consistent with mcc_exc_flt().
+   */
+  if (!mce)
+    return;
+
+#if MC_VERBOSE
+  ee_printk("RAS: poll %d, fake %d, status %llx\n", mce->extcpu, fake, mce->status);
+#endif
+
+  mcc_conv(mce, &mc);
+  mc.ctl = ctl;
+  mc.flags = fake ? MC_FLG_FALSE : 0;
+
+#if BEAM_TEST
+  /*
+   * Under beam test we only want to send the SCIF message
+   */
+  micras_mc_send(&mc, fake);
+  return;
+#endif
+
+  /* Remember how far the kernel log has been reported to the host */
+  if (micras_mc_send(&mc, fake))
+    mcc_seen = mcelog.next;
+
+  /*
+   * According to MCA HAS the MCI_STATUS_VAL will only
+   * be set when an event's enable bit is set, in which
+   * case it is difficult to imagine how events without
+   * the MCI_STATUS_EN can appear here. The second clause
+   * of the test may never actually happen on Kn{F,C}.
+   * Note: MC polling does not capture TSCs
+   */
+  if (fake || !(mc.status & MCI_STATUS_EN)) {
+    uint32_t msk;
+
+    msk = micras_mc_filter(&mc, rdtsc(), fake);
+    if (msk)
+      mcc_ctl_mask(mce->bank, msk);
+  }
+}
+
+
+/*
+ * One CPU entered do_machine_check().
+ * We get the initial mce record (which has cpu ID), early
+ * control variables and whether the event is injected.
+ *
+ * Since KnF and KnC deviate from the standard IA by not
+ * having the core MCAs broadcast to all CPU's we'll try
+ * to fake standard behavior in order to keep the generic
+ * machine check code intact.
+ * Therefore, if event is real (fake flag unset) and this
+ * CPU is the first seeing it (mcc_exc_mask is empty),
+ * then send IPI to all other CPU's listed in the online
+ * cpumask for vector #18. Later CPUs will see themselves
+ * marked in mcc_exc_mask and return quickly.
+ */
+
+struct cpumask mcc_exc_mask; /* CPU's in mce ctx */
+static atomic_t ipi_lock = ATOMIC_INIT(0); /* Lock on exc mask */
+
+static void
+mcc_exc_entry(struct mce * mce, int fake, int no_way_out, int entry, char * msg)
+{
+  unsigned int	cpu;
+
+  /*
+   *TBD: should we use 'extcpu' from the MCE record instead?
+   */
+  cpu = smp_processor_id();
+
+  /*
+   * Injected events invokes all CPUs automatically
+   * by hooking into the NMI notify_die call_chain.
+   * Nothing to do here.
+   */
+  if (fake)
+    return;
+
+#if 1
+  /*
+   * Avoid the IPI corralling circus on corrected errors,
+   * based on assessment entirely done by mce_severity().
+   * If the result (no_way_out) is MCE_NO_SEVERITY (=0), then
+   * at worst we may have a correctable error, and that does
+   * not warrant the system lockdown managed by mce_start()
+   * and mce_end().
+   * Note that MICs do not support newer status bits (MCG_SER_P)
+   * which causes variable mce_ser always to be zero and thus
+   * the test in the inner loop of do_machine_check() will be
+   * reduced to just testing for the UC bit.
+   */
+  if (! no_way_out)
+    return;
+#endif
+
+  /*
+   * Test for entry from MT thread IPIs (testing)
+   * or a 'soft' exception from a IPI issued from
+   * the handler of the first exception.
+   * No further action needed in both cases.
+   */
+  if (cpumask_test_cpu(cpu, &mcc_exc_mask))
+    return;
+
+  /*
+   * Create mcc_exc_mask to flag which CPU's are
+   * to be included in the IPI. This mask is later
+   * used to determine who needs to EOI the local
+   * APIC after MC event handling.
+   * The xchg-based spin lock is usable in exception
+   * context where sleeping locks are not.
+   */
+  while(atomic_xchg(&ipi_lock, 1))
+    cpu_relax();
+  smp_rmb();		/* See mask updates made by other CPUs */
+  if (cpumask_test_cpu(cpu, &mcc_exc_mask)) {
+    /*
+     * Another CPU got here first
+     */
+    atomic_xchg(&ipi_lock, 0);
+    return;
+  }
+  cpumask_copy(&mcc_exc_mask, cpu_online_mask);
+  cpumask_clear_cpu(cpu, &mcc_exc_mask);
+  smp_wmb();		/* Publish the mask before dropping the lock */
+  atomic_xchg(&ipi_lock, 0);
+
+  /*
+   * Simulate a broadcast by sending IPI to all
+   * other CPUs.
+   */
+  // apic->send_IPI_mask(&mcc_exc_mask, MCE_VECTOR);
+  apic->send_IPI_allbutself(MCE_VECTOR);
+}
+
+
+/*
+ * In do_machine_check() bank scan loop.
+ * Called from a lockdown, no synchronization needed.
+ * MC bank scan is complete and the mce event has been
+ * entered into the kernel MC log
+ *
+ *TBD: revise logic on HALT on UC events?
+ * From a state corruption point of view this
+ * _is_ a fatal error because UC bit was set.
+ * However, if the tolerance setting is set
+ * high enough, the generic MC handler may
+ * not chose to panic on this event.
+ * We currently do not have the tolerance value
+ * when recording this event, nor do we have
+ * other factors that mce_reign() use to determine
+ * what to do after reporting event to the host.
+ */
+
+static void
+mcc_exc_log(struct mce * mce, uint64_t ctl, int fake,
+		int no_way_out, char * msg, int severity, int worst)
+{
+  struct mce_info mc;
+  uint32_t msk;
+
+#if MC_VERBOSE
+  ee_printk("RAS: log %d, wall %lld, nwo %d (%s), sev %d, wst %d\n",
+	mce->extcpu, mce->time, no_way_out, msg, severity, worst);
+#endif
+
+  /*
+   * Create a message for the host.
+   * Note the |= : the FATAL flag set by mcc_conv() on UC
+   * events must be preserved here.
+   */
+  mcc_conv(mce, &mc);
+  mc.ctl = ctl;
+  mc.flags |= fake ? MC_FLG_FALSE : 0;
+
+#if BEAM_TEST
+  /*
+   * Under beam test we only want to send the SCIF message
+   * This is guaranteed not to be called re-entrantly.
+   */
+  micras_mc_send(&mc, 1);
+  return;
+#endif
+
+#ifdef CONFIG_MK1OM
+  /*
+   * If this is a true event then log it in the EEPROM and
+   * notify SMC that we've had a serious machine check error.
+   */
+  if ((mc.flags & (MC_FLG_FALSE | MC_FLG_FATAL)) == MC_FLG_FATAL) {
+    micras_mc_log(&mc);
+    mc.flags |= MC_FLG_LOG;
+
+    /*
+     *TBD: Should this be deferred until the actual panic?
+     *     The user can raise tolerance such that we in
+     *     fact continue operating; in which case the SMC
+     *     notification would be (somewhat) misleading.
+     */
+    micras_mc_ipmi(&mc, 1);
+  }
+#endif
+
+  /*
+   * Always notify host and sync to kernel log
+   */
+  if (micras_mc_send(&mc, 1))
+    mcc_seen = mcelog.next;
+
+#if RAS_HALT
+  if ((mc.flags & MC_FLG_FATAL) && !fake)
+    panic("FATAL core machine check event:\n"
+          "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
+		mc.org, mc.id, mc.ctl, mc.status, mc.addr, mc.misc);
+#endif
+
+  /*
+   * Correctable events can in fact reach us here if
+   * mce_no_way_out() tags them as critical (for other
+   * reasons than the UC flag, e.g. MCIP missing).
+   * If the tolerance setting is high enough to prevent
+   * such events to panic, we'd still want filtering.
+   */
+  msk = micras_mc_filter(&mc, mce->tsc, 1);
+  if (msk)
+    mcc_ctl_mask(mce->bank, msk);
+}
+
+
+/*
+ * In mce_panic().
+ * Current event is about to make the kernel panic.
+ * Sources of this call are
+ * do_machine_check(), when no_way_out set
+ * mce_timed_out(), CPU rendez-vous failed
+ *	mce_reign(), when severity is high, a CPU hung, or no events
+ */
+
+/*
+ * Hook called right before mce_panic().
+ * Leaves a trace of the impending panic on the POST-card UART.
+ * The mce pointer may legitimately be NULL (see comment below),
+ * so it must be checked before use — the previous version
+ * dereferenced it unconditionally.
+ */
+static void
+mcc_exc_panic(struct mce * mce, char * msg, char * exp, int fake)
+{
+  /*
+   * Should host be notified in this case?
+   * And if so, how should be presented, we might not
+   * even have a mce record to show when this happens!
+   * If an mce is passed, it has already been seen and
+   * reported to the host by a call to mcc_exc_log().
+   * If mce is NULL, then this _is_ an MC related panic,
+   * but we have no data fitting for a host notification.
+   * Create a pseudo event and ship that?
+   */
+  if (mce)
+    ee_printk("RAS: panic %d, wall %lld, msg %s, exp %s, fake %d\n",
+	mce->extcpu, mce->time, msg, exp, fake);
+  else
+    ee_printk("RAS: panic (no mce record), msg %s, exp %s, fake %d\n",
+	msg, exp, fake);
+}
+
+
+/*
+ * A CPU is leaving do_machine_check().
+ * We get this after the monarch has 'reigned' and
+ * the response to the event has been completed.
+ */
+
+static void
+mcc_exc_exit(struct mce * mce, int no_way_out, int worst, int entry, int order)
+{
+  int	need_eoi;
+
+  /*
+   * If this CPU entered the handler via our simulated broadcast
+   * IPI (its bit is set in mcc_exc_mask), clear the bit and
+   * acknowledge the local APIC.
+   * Relies on cpumask_test_and_clear_cpu() being atomic.
+   */
+  smp_rmb();
+  need_eoi = cpumask_test_and_clear_cpu(smp_processor_id(), &mcc_exc_mask);
+  smp_wmb();
+
+  if (need_eoi)
+    ack_APIC_irq();
+}
+
+
+/*
+ * Routine to scan the kernel's MC log.
+ * Called when SCIF MC session has been created, to bring the host
+ * side up to date with prior unreported MC events, such as events
+ * occurring when MC session was not active (no peer was listening
+ * on the host) and events occurring before RAS module was loaded.
+ *
+ * Notes:
+ * - This is always called in thread context.
+ * - There are no injection flags in the kernel
+ * MC log, i.e. no guarantee events are genuine.
+ * - The MC kernel log has been exported explicitly for this.
+ *
+ * On synchronization (or the lack thereof):
+ * Effectively the mcelog holds a static array of mce's where the
+ * 'finished' flag says whether mce content is valid or not. The
+ * 'next' field is the index of the first element in the array that
+ * has not been assigned for an MC event. It is incremented when a
+ * new event is entered, and reset to zero on reads to /dev/mcelog.
+ * The kernel's event log does not wrap, so it is safe to use it as
+ * an indicator of how many events (finished or not) are in it.
+ * The mcelog's next field is protected by RCU style mechanisms
+ * in the kernel MCA handler (see arch/x86/kernel/cpu/mcheck/mce.c).
+ * For obvious reasons it is not genuine RCU, e.g. access to 'next'
+ * isn't within rcu_read_lock()/rcu_read_unlock() pair, just a clever
+ * masking use of a lock in an RCU macro definition.
+ * There is no RCU moving data around, the mce array does not move,
+ * and the 'finished' flag is set after a wmb() on the mce contents
+ * which means this routine will not clash with the MCE handler.
+ * Collisions with memset() on reads from /dev/mcelog are prevented
+ * by locking of mce_read_mutex.
+ */
+
+void
+mcc_sync(void)
+{
+  struct mce_info mc;
+  unsigned	seen;
+
+  if (mce_disabled)
+    return;
+
+#if 0
+  /*
+   * Can't do this until bootstrap scrubs MC banks on all cards.
+   * It has been observed that MCA banks may _not_ be reset on card
+   * reboot which means events picked up by the kernel before loading
+   * the RAS module may have occurred in a previous uOS run.
+   * Should be OK post early Jan '12 (flash ver 262, HSD 4115351).
+   */
+  return;
+#endif
+
+  /*
+   * Lock out kernel log access through /dev/mcelog
+   */
+  mutex_lock(&mce_read_mutex);
+
+  /*
+   * Start over if the log has been cleared
+   */
+  if (mcc_seen > mcelog.next)
+    mcc_seen = 0;
+
+  for(seen = mcc_seen; seen < mcelog.next; seen++) {
+    /*
+     * Basic checks. Index, CPU & bank must be reasonable.
+     * NOTE(review): only banks 0..2 are accepted here, while the
+     * filter table knows banks up to 6 — presumably the kernel
+     * mcelog only ever receives core-bank events; confirm.
+     */
+    if (mcelog.entry[seen].finished) {
+      if (mcelog.entry[seen].cpu >= NR_CPUS ||
+	  mcelog.entry[seen].bank >= 3) {
+	printk("mcc_sync: entry %d contains garbage, cpu %d, bank %d\n",
+		seen, mcelog.entry[seen].cpu, mcelog.entry[seen].bank);
+	continue;
+      }
+
+      /*
+       * Have good entry, can be UC, but it is 'old'.
+       */
+      mcc_conv(&mcelog.entry[seen], &mc);
+      mc.ctl = 0;
+
+#ifdef CONFIG_MK1OM
+      /*
+       * Log this event in the eeprom and notify
+       * that we've had a serious machine check error.
+       * in_sync tells micras_mc_log() this is a replay.
+       */
+      if (mc.flags & MC_FLG_FATAL) {
+	in_sync = 1;
+        micras_mc_log(&mc);
+	in_sync = 0;
+        mc.flags |= MC_FLG_LOG;
+	micras_mc_ipmi(&mc, 0);
+      }
+#endif
+
+      /*
+       * Notify host about this too.
+       * A failed send means no SCIF peer; stop and retry
+       * from the same spot on the next sync.
+       */
+      if (! micras_mc_send(&mc, 0))
+	break;
+    }
+  }
+  mcc_seen = mcelog.next;
+
+  /*
+   * Done, release lock
+   */
+  mutex_unlock(&mce_read_mutex);
+}
+
+
+/*
+ * Set up exception handlers by hooking into the
+ * kernel's native MCA handler.
+ */
+
+int __init
+mcc_init(void)
+{
+  /*
+   * Install our callbacks in the native MCA handler,
+   * unless machine checks are disabled altogether.
+   */
+  if (mce_disabled) {
+    printk("RAS.core: disabled\n");
+    return 0;
+  }
+
+  mca_poll = mcc_poll;
+  mca_exc_flt = mcc_exc_flt;
+  mca_exc_entry = mcc_exc_entry;
+  mca_exc_log = mcc_exc_log;
+  mca_exc_panic = mcc_exc_panic;
+  mca_exc_exit = mcc_exc_exit;
+  mca_print = 0;		/* For debug: ee_printk; */
+
+  printk("RAS.core: init complete\n");
+  return 0;
+}
+
+
+/*
+ * Cleanup for module unload.
+ * Clear/restore hooks in the native MCA handler.
+ */
+
+int __exit
+mcc_exit(void)
+{
+  /* Unhook every callback from the native MCA handler */
+  mca_poll = 0;
+  mca_exc_flt = 0;
+  mca_exc_entry = 0;
+  mca_exc_log = 0;
+  mca_exc_panic = 0;
+  mca_exc_exit = 0;
+  mca_print = 0;
+
+  /*
+   * Links from kernel's MCE handler cut,
+   * wait for everybody in handler to leave.
+   * (mce_entry counts CPUs currently inside the handler.)
+   */
+  while(atomic_read(&mce_entry))
+    cpu_relax();
+
+  printk("RAS.core: exit complete\n");
+  return 0;
+}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS EEPROM log driver
+ *
+ * Contains code to handle creation of MC event records in
+ * the designated EEPROM hanging off the 'OverClocking' I2C bus.
+ *
+ * Since it is not clear for the moment for how long the serial
+ * port on the POST card needs to (or will) be supported, it is
+ * not safe to assume we just can tap into the Linux I2C frame
+ * work to access the 'OverClocking' I2C bus.
+ *
+ * Furthermore, we need access from exception context, and cannot
+ * run a driver that has spinlocks, mutexes and sleeps in it's path
+ * like the current PXA-derived driver has.
+ *
+ * Therefore, a local exception safe driver is included here.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/serial_reg.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include <asm/mic/mic_knc/autobaseaddress.h>
+#include <asm/mic/mic_knc/micsboxdefine.h>
+#include "micras.h"
+
+#ifdef MIC_IS_EMULATION
+/*
+ * Emulation does not handle I2C busses.
+ * Therefore all code that deals with I2C needs to be
+ * replaced with harmless substitutes in emulation.
+ * The following stubs are for emulation only.
+ */
+
+#if 0
+/*
+ * Probably don't need exclusive locks in emulation
+ */
+atomic_t pxa_block = ATOMIC_INIT(0);
+
+static void
+ee_lock(void)
+{
+ while(atomic_xchg(&pxa_block, 1))
+ myDELAY(50);
+}
+
+static void
+ee_unlock(void)
+{
+ atomic_xchg(&pxa_block, 0);
+}
+#endif
+
+char ee_buf[EE_BUF_COUNT * EE_BUF_LINELEN];
+atomic_t ee_msg = ATOMIC_INIT(-1);
+atomic_t ee_seen = ATOMIC_INIT(0);
+int ee_rdy;
+
+/*
+ * Format a message into the next free slot of the emulation
+ * ring buffer (ee_buf).
+ *
+ * fmt/args   printf-style format and arguments
+ *
+ * Returns a pointer to the formatted line, or NULL when the
+ * writer (ee_msg) has run more than EE_BUF_COUNT - 1 slots
+ * ahead of the reader (ee_seen) and the message is dropped.
+ * (Returns NULL rather than the integer literal 0 — this is
+ * a pointer-valued function.)
+ */
+char *
+ee_fmt(char * fmt, va_list args)
+{
+  char	      *	buf;
+  int		msg_id, msg_btm;
+
+  msg_btm = atomic_read(&ee_seen);
+  msg_id = atomic_inc_return(&ee_msg);
+  if ((msg_id - msg_btm) < (EE_BUF_COUNT - 1)) {
+    buf = ee_buf + (msg_id % EE_BUF_COUNT) * EE_BUF_LINELEN;
+    vsnprintf(buf, EE_BUF_LINELEN - 1, fmt, args);
+    return buf;
+  }
+  return NULL;		/* Buffer full, message dropped */
+}
+
+/*
+ * Exception-safe printf substitute (emulation build).
+ * Returns the length of the formatted line, 0 if it was dropped.
+ */
+int
+ee_printk(char * fmt, ...)
+{
+  va_list	args;
+  char	      *	line;
+  int		len = 0;
+
+  va_start(args, fmt);
+  line = ee_fmt(fmt, args);
+  va_end(args);
+
+  if (line)
+    len = strlen(line);
+  return len;
+}
+
+/*
+ * Public printf substitute (emulation build); same back end
+ * as ee_printk. Returns line length, 0 if dropped.
+ */
+int
+ee_print(char * fmt, ...)
+{
+  va_list	args;
+  char	      *	line;
+  int		len = 0;
+
+  va_start(args, fmt);
+  line = ee_fmt(fmt, args);
+  va_end(args);
+
+  if (line)
+    len = strlen(line);
+  return len;
+}
+EXPORT_SYMBOL_GPL(ee_print);
+
+
+/*
+ * Emulation init: no I2C hardware exists, so just mark
+ * the log layer ready and report status. Always succeeds.
+ */
+int
+ee_init(void)
+{
+  ee_rdy = 1;
+
+  printk(mce_disabled ? "RAS.elog (EMU): disabled\n"
+		      : "RAS.elog (EMU): init complete\n");
+  return 0;
+}
+
+/* Emulation teardown: mark the log layer unavailable. */
+int
+ee_exit(void)
+{
+  ee_rdy = 0;
+
+  printk("RAS.elog (EMU): exit complete\n");
+  return 0;
+}
+
+/*
+ * Emulation stub for the EEPROM event log: there is no EEPROM,
+ * so the event is echoed to the emulated console buffer instead.
+ */
+void
+micras_mc_log(struct mce_info * event)
+{
+  if (mce_disabled)
+    return;
+
+  /*
+   * Print entry on serial console (copy in kernel log)
+   */
+  ee_printk("RAS.elog (EMU): bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
+	     event->org, event->id, event->ctl, event->status, event->addr, event->misc);
+}
+
+#else
+
+/*
+**
+** Exception safe I2C driver for the 'OverClocking' bus.
+** The driver is a derivative of the FreeBSD driver that
+** Ben W wrote. I.e. it is safe to re-use here because we
+** wrote it in the first place, copyright is ours.
+**
+** NOTE: This I2C bus is usually run by the PXA driver,
+** which means that the activities of this driver
+** may interrupt the PXA driver's activity, i.e.
+** interrupt the serial console.
+** This is by design, the alternative was major
+** hacking of the PXA driver to support use in
+** exception context.
+**
+** NOTE: This code is currently exclusively designed to
+** run on a KnF or KnC device, i.e. we know what
+** hardware is present and we know the location
+** of the CSRs. This code does very little for
+** niceties like device discovery and registration.
+**
+** NOTE: Timing is altered slightly from the FreeBSD code.
+** The I2C bus should run in 400 kHz mode, which at
+** optimal conditions can transmit a byte in about
+** 25 uSec (8 bits + ack/nak + a little overhead).
+** Therefore it does not make much sense to poll
+** much faster than 1 uSec anywhere in this driver.
+** However, experiments show that timing is far
+** from optimal, though it is not clear whether
+** it is the UART or the controller that's slow.
+** Update: In fact some of the boards cannot run
+** reliably at 400 kHz, so we switched to 100 kHz.
+*/
+
+#define REG_DBG 0 /* Debug I2C Layer 1 */
+#define I2C_DBG 0 /* Debug I2C Layer 2 */
+#define XFR_DBG 0 /* Debug I2C Layer 3 */
+#define CON_DBG 0 /* Debug I2C UART */
+#define EPR_DBG 0 /* Debug EEPROM log */
+
+#if REG_DBG
+#define REG_REG reg_dmp
+#else
+#define REG_REG(s); /* As nothing */
+#endif
+
+#if I2C_DBG
+#define I2C_PRT ee_printk
+#else
+#define I2C_PRT(s,...); /* As nothing */
+#endif
+
+#if XFR_DBG
+#define XFR_PRT ee_printk
+#else
+#define XFR_PRT(s,...); /* As nothing */
+#endif
+
+#if CON_DBG
+#define CON_PRT ee_printk
+#else
+#define CON_PRT(s,...); /* As nothing */
+#endif
+
+#if EPR_DBG
+#define EPR_PRT ee_printk
+#else
+#define EPR_PRT(s,...); /* As nothing */
+#endif
+
+
+#include <mic/micsboxdefine.h>
+#include "monahan.h"
+
+
+/*
+ *TBD: Get rid of Pascal relics!
+ */
+
+#ifndef FALSE
+#define FALSE false
+#endif
+#ifndef TRUE
+#define TRUE true
+#endif
+
+
+/*
+ * Local timer routine.
+ * Similar to the udelay function, just simpler.
+ *
+ * The delay instruction can only go upto 1023 clocks,
+ * and larger delay needs to be split into two or more
+ * delay instructions.
+ * According to Kn{F|C} errata, delay disables interrupts.
+ * Want to play nice and allow interrupts every 250 clocks.
+ * For now the overhead of the loop is ignored.
+ */
+
+#define MAX_DELAY 250
+
+/*
+ * Busy-wait for 'usec' microseconds using the Kn{F,C} 'delay'
+ * instruction, chopped into MAX_DELAY-clock chunks so that
+ * interrupts are not kept disabled for too long (per errata,
+ * 'delay' disables interrupts while it runs).
+ */
+void
+myDELAY(uint64_t usec)
+{
+  uint64_t	num_cpu_clks, tick;
+
+  /*
+   * Convert usec count into CPU clock cycles.
+   * Similar to set_cyc2ns_scale() we have:
+   *   us = cycles / (freq / us_per_sec)
+   *   us = cycles * (us_per_sec / freq)
+   *   us = cycles * (10^6 / (cpu_khz * 10^3))
+   *   us = cycles * (10^3 / cpu_khz)
+   *   cycles = us / ((10^3 / cpu_khz))
+   *   cycles = (us * cpu_khz) / 10^3
+   */
+  num_cpu_clks = (usec * tsc_khz) / 1000;
+
+  if (num_cpu_clks <= MAX_DELAY) {
+    __asm__ __volatile__("delay %0"::"r"(num_cpu_clks):"memory");
+  } else {
+    /* Burn MAX_DELAY-clock chunks, then the remainder (<= MAX_DELAY) */
+    for(tick = MAX_DELAY; num_cpu_clks > tick; num_cpu_clks -= tick)
+      __asm__ __volatile__("delay %0"::"r"(tick):"memory");
+    __asm__ __volatile__("delay %0"::"r"(num_cpu_clks):"memory");
+  }
+}
+
+
+/*
+ * Layer 1 abstraction: device bus (controller register access)
+ *
+ * Access API to provide read/write to the I2C controller.
+ * Simply use a local copy of the SBOX MMIO routines, where the
+ * 'OverClocking' I2C controller CSRs starts at offset 0x1000.
+ * We use a local copy in order to not mix I2C register traces
+ * with those of the SBOX MMIO routines in micras_main.c.
+ *
+ *TBD: Shall debug features stay in the code?
+ */
+
+#if REG_DBG
+
+/*
+ * I2C controller register dump utilities.
+ * Traces go to the kernel log.
+ */
+
+/*
+ * Flag-name descriptor used by the register dump helpers:
+ * when 'mask' is set in a register value its 'set' name is
+ * printed, otherwise 'unset' (a null name prints nothing).
+ */
+struct bits {
+  uint32_t	mask;
+  char		*set;
+  char		*unset;
+};
+
+#define PXA_BIT(m, s, u)	{ .mask = m, .set = s, .unset = u }
+
+/* Bit names of the I2C Control Register (ICR) */
+static struct bits icr_bits[] = {
+  PXA_BIT(ICR_START,  "START",	0),
+  PXA_BIT(ICR_STOP,   "STOP",	0),
+  PXA_BIT(ICR_ACKNAK, "NAK",	"ACK"),
+  PXA_BIT(ICR_TB,     "TB",	0),
+  PXA_BIT(ICR_MA,     "MA",	0),
+  PXA_BIT(ICR_SCLE,   "SCLE",	0),
+  PXA_BIT(ICR_IUE,    "IUE",	0),
+  PXA_BIT(ICR_GCD,    "GCD",	0),
+  PXA_BIT(ICR_ITEIE,  "ITEIE",	0),
+  PXA_BIT(ICR_DRFIE,  "DRFIE",	0),
+  PXA_BIT(ICR_BEIE,   "BEIE",	0),
+  PXA_BIT(ICR_SSDIE,  "SSDIE",	0),
+  PXA_BIT(ICR_ALDIE,  "ALDIE",	0),
+  PXA_BIT(ICR_SADIE,  "SADIE",	0),
+  PXA_BIT(ICR_UR,     "UR",	0),
+};
+
+/* Bit names of the I2C Status Register (ISR) */
+static struct bits isr_bits[] = {
+  PXA_BIT(ISR_RWM,    "RX",	"TX"),
+  PXA_BIT(ISR_ACKNAK, "NAK",	"ACK"),
+  PXA_BIT(ISR_UB,     "UB",	0),
+  PXA_BIT(ISR_IBB,    "IBB",	0),
+  PXA_BIT(ISR_SSD,    "SSD",	0),
+  PXA_BIT(ISR_ALD,    "ALD",	0),
+  PXA_BIT(ISR_ITE,    "ITE",	0),
+  PXA_BIT(ISR_IRF,    "IRF",	0),
+  PXA_BIT(ISR_GCAD,   "GCAD",	0),
+  PXA_BIT(ISR_SAD,    "SAD",	0),
+  PXA_BIT(ISR_BED,    "BED",	0),
+};
+
+
+/*
+ * Print the name of every flag in 'bits' according to whether it
+ * is set in 'val'. Output goes to the kernel log, no newline.
+ */
+static void
+decode_bits(char *prefix, struct bits *bits, int num, uint32_t val)
+{
+  char	      * str;
+
+  printk("  %s: ", prefix);
+  while (num--) {
+    str = (val & bits->mask) ? bits->set : bits->unset;
+    if (str)
+      printk("%s ", str);
+    bits++;
+  }
+}
+
/* Trace a decoded ICR value to the kernel log */
static void reg_ICR(uint32_t val)
{
	decode_bits("ICR", icr_bits, ARRAY_SIZE(icr_bits), val);
	printk("\n");
}
+
/* Trace a decoded ISR value to the kernel log */
static void reg_ISR(uint32_t val)
{
	decode_bits("ISR", isr_bits, ARRAY_SIZE(isr_bits), val);
	printk("\n");
}
+
+
/*
 * Dump all five I2C controller registers to the kernel log,
 * prefixed by the caller-supplied string.
 * NOTE(review): this uses the traced SBOX accessor mr_sbox_rl(),
 * not the silent local copy lmr_sbox_rl() used elsewhere in this
 * layer -- presumably intentional for debug builds; confirm.
 */
static void
reg_dmp(char * str)
{
	printk("%s: ICR %08x, ISR %08x, ISAR %08x, IDBR %08x, IBMR %08x\n", str,
		mr_sbox_rl(0, SBOX_OC_I2C_ICR + ICR_OFFSET),
		mr_sbox_rl(0, SBOX_OC_I2C_ICR + ISR_OFFSET),
		mr_sbox_rl(0, SBOX_OC_I2C_ICR + ISAR_OFFSET),
		mr_sbox_rl(0, SBOX_OC_I2C_ICR + IDBR_OFFSET),
		mr_sbox_rl(0, SBOX_OC_I2C_ICR + IBMR_OFFSET));
}
+
+#endif /* REG_DBG */
+
+
+/*
+ * Local versions of SBOX access routines, that
+ * does not leave trace messages in kernel log.
+ */
+
+uint32_t
+lmr_sbox_rl(int dummy, uint32_t roff)
+{
+ uint32_t val;
+
+ val = * (volatile uint32_t *)(micras_sbox + roff);
+ return val;
+}
+
+void
+lmr_sbox_wl(int dummy, uint32_t roff, uint32_t val)
+{
+ * (volatile uint32_t *)(micras_sbox + roff) = val;
+}
+
/*
 * Read one I2C controller register; 'reg' is the offset from the
 * controller base at SBOX_OC_I2C_ICR. In REG_DBG builds the access
 * is traced, with full bit decoding for ICR and ISR.
 */
static uint32_t
reg_read(uint32_t reg)
{
	uint32_t val;

	val = lmr_sbox_rl(0, SBOX_OC_I2C_ICR + reg);

#if REG_DBG
	printk("%s: %4x -> %08x", "rd", SBOX_OC_I2C_ICR + reg, val);
	switch(reg) {
	case ICR_OFFSET: reg_ICR(val); break;
	case ISR_OFFSET: reg_ISR(val); break;
	default:
		printk("\n");
	}
#endif

	return val;
}
+
/*
 * Write one I2C controller register; 'reg' is the offset from the
 * controller base at SBOX_OC_I2C_ICR. In REG_DBG builds the access
 * is traced, with full bit decoding for ICR.
 */
static void
reg_write(uint32_t reg, uint32_t val)
{
#if REG_DBG
	printk("%s: %4x <- %08x", "wr", SBOX_OC_I2C_ICR + reg, val);
	switch(reg) {
	case ICR_OFFSET: reg_ICR(val); break;
	default:
		printk("\n");
	}
#endif

	lmr_sbox_wl(0, SBOX_OC_I2C_ICR + reg, val);
}
+
+
+/*
+ * Layer 2 abstraction: I2C bus driver (byte access to I2C bus)
+ *
+ * Mostly a re-implementation of Ben W's low level FreeBSD driver.
+ * Provides an API to control what goes onto the I2C bus on a
+ * per individual byte basis.
+ *
+ * i2c_reset Reset bus controller
 *	i2c_init	Setup transaction parameters (speed & mode)
+ * i2c_start Send slave address + R/W bit
+ * i2c_rd_byte Read data byte
+ * i2c_wr_byte Send data byte
+ * i2c_stop Stop current transaction
+ *
+ * NOTE: It seems that the controller lacks means to reset the
+ * I2C bus (i.e. other devices on it). The controller
+ * resets fine, but at least the UART has been seen
+ * locking up and blocking the bus entirely.
+ */
+
static uint8_t hnd_addr = 0;			/* Target (slave) address for next transfer */
static int hnd_freq = FREQ_100K;		/* Requested bus speed */

static uint8_t bus_slave_addr = ISAR_SLADDR;	/* Our own I2C slave address (ISAR) */
static int bus_start_op = I2C_NOP;		/* Operation of current transaction: R or W */
static int bus_freq = 0;			/* Bus speed actually programmed into ICR */
static int bus_inited = 0;			/* Non-zero once controller has been set up */
+
+
+/*
+ * Master abort.
+ * Flip the ICR:MA bit long enough for current
+ * byte transfer to clock in/out on the wire.
+ */
+
+static int
+i2c_master_abort(void) {
+ I2C_PRT("i2c_master_abort: entry\n");
+
+ reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) | ICR_MA);
+ myDELAY(25);
+ reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) & ~ICR_MA);
+
+ I2C_PRT("i2c_master_abort: exit\n");
+ return 0;
+}
+
+
+/*
+ * Receive completion helper.
+ * Transmission ended (we got IRF), check if it was OK.
+ * We get ISR and whether a stop condition was expected.
+ */
+
/*
 * Validate ISR state after a receive completed (IRF seen).
 * Returns XFER_SUCCESS, RX_SEVERE_ERROR or RX_BIZARRE_ERROR.
 */
static int
check_rx_isr(uint32_t isr, bool stop)
{
	I2C_PRT("check_rx_isr: entry, isr %02x, stop %d\n", isr, stop);
	REG_REG("+check_rx_isr");

	if (stop) {
		/*
		 * Last byte read, controller is expected to give a
		 * NAK to slave. Verify that indeed is set in ISR.
		 */
		if (!(isr & ISR_ACKNAK)) {
			REG_REG("-check_rx_isr");
			I2C_PRT("check_rx_isr: !ISR_ACKNAK, rtn %d\n", RX_SEVERE_ERROR);
			return RX_SEVERE_ERROR;
		}

		/*
		 * The controller is expected to set the STOP condition.
		 * Once completed the controller clears the RWM bit of the ISR.
		 * Wait for this to happen in max 200 uSec (100 polls x 2 uSec).
		 */
		if (isr & ISR_RWM) {
			int counter;

			I2C_PRT("check_rx_isr: RWM\n");
			counter = 100;
			while((reg_read(ISR_OFFSET) & ISR_RWM) && --counter)
				myDELAY(2);
			if(! counter) {
				REG_REG("-check_rx_isr");
				I2C_PRT("check_rx_isr: timeout, RWM wait %d uSec, rtn %d\n", 2 * 100, RX_BIZARRE_ERROR);
				return RX_BIZARRE_ERROR;
			}
			I2C_PRT("check_rx_isr: RWM clear, waited %d uSec\n", 2 * (100 - counter));
		}
	} else {
		/*
		 * Mid-message, verify that unit is still busy, received
		 * no NAK and that message operation is still 'read'.
		 */
		if (!(isr & ISR_UB)) {
			REG_REG("-check_rx_isr");
			I2C_PRT("check_rx_isr: !UB, rtn %d\n", RX_SEVERE_ERROR);
			return RX_SEVERE_ERROR;
		}

		if (isr & ISR_ACKNAK) {
			REG_REG("-check_rx_isr");
			I2C_PRT("check_rx_isr: ISR_ACKNAK, rtn %d\n", RX_SEVERE_ERROR);
			return RX_SEVERE_ERROR;
		}

		if (!(isr & ISR_RWM)) {
			REG_REG("-check_rx_isr");
			I2C_PRT("check_rx_isr: !ISR_RWM, rtn %d\n", RX_BIZARRE_ERROR);
			return RX_BIZARRE_ERROR;
		}
	}

	REG_REG("-check_rx_isr");
	I2C_PRT("check_rx_isr: done, rtn %d\n", XFER_SUCCESS);
	return XFER_SUCCESS;
}
+
+/*
+ * Wait for receive completion.
+ * We get if stop condition expected.
+ */
+
/*
 * Poll for receive completion (IRF set), then validate and clear it.
 * Returns XFER_SUCCESS, an RX_* error, or INCOMPLETE_XFER on timeout
 * (100 polls at roughly one I2C clock period each).
 */
static int
i2c_wait_rx_full(bool stop)
{
	int uwt, counter, err;
	uint32_t temp;

	I2C_PRT("i2c_wait_rx_full: entry, stop %d\n", stop);
	REG_REG("+i2c_wait_rx_full");

	/*
	 * Guess on how long one I2C clock cycle is (in uSec)
	 */
	uwt = (bus_freq == FREQ_400K) ? 3 : 10;

	/*
	 * Wait for receive to end (IRF set).
	 * Since slave can hold the SCL to reduce the speed
	 * we wait longer than we expect the receive to last.
	 */
	counter = 100;
	err = INCOMPLETE_XFER;
	while(counter) {
		temp = reg_read(ISR_OFFSET);
		if (temp & ISR_IRF) {
			I2C_PRT("i2c_wait_rx_full: IRF, ISR %02x\n", temp);
			err = check_rx_isr(temp, stop);
			/* Write-1-to-clear the IRF latch */
			reg_write(ISR_OFFSET, reg_read(ISR_OFFSET) | ISR_IRF);
			switch(err) {
			case XFER_SUCCESS:
				break;
			case RX_SEVERE_ERROR:
				break;
			case RX_END_WITHOUT_STOP:
				/*
				 * NOTE(review): check_rx_isr() above is not seen
				 * returning this code; branch looks unreachable.
				 */
				i2c_master_abort();
				break;
			default:
				/*
				 * This is odd/unexpected, but not
				 * something we can do anything about.
				 */
				err = XFER_SUCCESS;
			}
			break;
		}
		myDELAY(uwt);
		counter--;
	}

	REG_REG("-i2c_wait_rx_full");
	I2C_PRT("i2c_wait_rx_full: done, IRF wait %d uSec, err %d\n", uwt * (100 - counter), err);
	return err;
}
+
+
+/*
+ * Transmit completion helper.
+ * Transmission ended (we got ITE), check if it was OK.
+ * We get ISR, the current operation and whether a stop
+ * condition was expected (last byte of transmission).
+ */
+
+static int
+check_tx_isr(uint32_t isr, bool stop, int op)
+{
+ I2C_PRT("check_tx_isr: entry, isr %02x, stop %d, op %d\n", isr, stop, op);
+ REG_REG("+check_tx_isr");
+
+ if (isr & ISR_BED) { /* Bus error */
+ REG_REG("-check_tx_isr");
+ I2C_PRT("check_tx_isr: BED, rtn %d\n", TX_NAK);
+ return TX_NAK;
+ }
+
+ if(stop) {
+ /*
+ * Last byte write, controller expected to
+ * set the stop condition. This may take a
+ * while to complete, controller holds the
+ * UB flag of ISR until finished.
+ */
+ if(isr & ISR_UB) {
+ int counter;
+
+ I2C_PRT("check_rx_isr: UB\n");
+ counter = 100;
+ while((reg_read(ISR_OFFSET) & ISR_UB) && --counter)
+ myDELAY(2);
+ if (! counter) {
+ REG_REG("-check_tx_isr");
+ I2C_PRT("check_tx_isr: UB, timeout %d uSec, rtn %d\n", 2 * 100, TX_CONTROLLER_ERROR);
+ return TX_CONTROLLER_ERROR;
+ }
+ I2C_PRT("check_tx_isr: !UB, waited %d uSec\n", 2 * (100 - counter));
+ }
+ } else {
+ /*
+ * Mid-message, the bus is expected to be busy.
+ */
+ if(!(isr & ISR_UB)) {
+ REG_REG("-check_tx_isr");
+ I2C_PRT("check_tx_isr: !UB, rtn %d\n", TX_CONTROLLER_ERROR);
+ return TX_CONTROLLER_ERROR;
+ }
+ }
+
+ /*
+ * Assert that message operation hasn't changed
+ */
+ if ((isr & 0x1) != op) {
+ REG_REG("-check_tx_isr");
+ I2C_PRT("check_tx_isr: ISR %d != %d, rtn %d\n", isr & 0x1, op, TX_CONTROLLER_ERROR);
+ return TX_CONTROLLER_ERROR;
+ }
+
+ REG_REG("-check_tx_isr");
+ I2C_PRT("check_tx_isr: done, rtn %d\n", XFER_SUCCESS);
+ return XFER_SUCCESS;
+}
+
+/*
+ * Wait for transmit completion
+ * We get the current operation and if a stop
+ * condition was expected (last byte of transmission).
+ */
+
/*
 * Poll for transmit completion (ITE set), then validate and clear it.
 * Returns XFER_SUCCESS, a TX_* error, or INCOMPLETE_XFER on timeout
 * (100 polls at roughly one I2C clock period each).
 */
static int
i2c_wait_tx_empty(bool stop, int op)
{
	int counter, uwt, err;
	uint32_t temp;

	I2C_PRT("i2c_wait_tx_empty: entry, stop %d, op %d\n", stop, op);
	REG_REG("+i2c_wait_tx_empty");

	/*
	 * Guess on how long one I2C clock cycle is (in uSec)
	 */
	uwt = (bus_freq == FREQ_400K) ? 3 : 10;

	/*
	 * Wait for transmission to end (ITE set)
	 * Since slave can hold the SCL to lower the speed
	 * we wait longer than we expect the transmission
	 * to last.
	 */
	counter = 100;
	err = INCOMPLETE_XFER;
	while(counter) {
		temp = reg_read(ISR_OFFSET);
		if (temp & ISR_ITE) {
			I2C_PRT("i2c_wait_tx_empty: ITE, ISR %02x\n", temp);
			/* Let the bus settle one clock, then re-sample ISR */
			myDELAY(uwt);
			temp = reg_read(ISR_OFFSET);
			err = check_tx_isr(temp, stop, op);
			/* Write-1-to-clear the ITE latch */
			reg_write(ISR_OFFSET, reg_read(ISR_OFFSET) | ISR_ITE);
			break;
		}
		myDELAY(uwt);
		counter--;
	}

	REG_REG("-i2c_wait_tx_empty");
	I2C_PRT("i2c_wait_tx_empty: done, ITE wait %d uSec, err %d\n", uwt * (100 - counter), err);
	return err;
}
+
+
+/*
+ * Setup for a transaction.
+ * Determine transmission speed and program ICR accordingly.
 * Also sets ISAR, though we probably don't need that.
+ */
+
+static int
+i2c_init(uint8_t slave_addr)
+{
+ uint32_t speed;
+
+ I2C_PRT("i2c_init: entry, slave_addr %02x, hnd_speed %d\n", slave_addr, hnd_freq);
+ REG_REG("+i2c_init");
+
+ switch(hnd_freq) {
+ case FREQ_MAX:
+ speed = I2C_HS_FAST;
+ break;
+ case FREQ_400K:
+ speed = I2C_FAST;
+ break;
+ case FREQ_100K:
+ speed = I2C_STANDARD;
+ break;
+ case FREQ_AUTO:
+#if I2C_SLOW
+ hnd_freq = FREQ_100K;
+ speed = I2C_STANDARD;
+#else
+ hnd_freq = FREQ_400K;
+ speed = I2C_FAST;
+#endif
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (bus_inited && hnd_freq == bus_freq) {
+ REG_REG("-i2c_init");
+ I2C_PRT("i2c_init: exit, bus_inited %d, hnd_freq %d\n", bus_inited, hnd_freq);
+ return 0;
+ }
+ I2C_PRT("i2c_init: speed %d, hnd_freq %d\n", bus_inited, hnd_freq);
+
+ bus_slave_addr = ISAR_SLADDR;
+ reg_write(ISAR_OFFSET, bus_slave_addr);
+ reg_write(ICR_OFFSET, (reg_read(ICR_OFFSET) & ~ICR_MODE) | ICR_ON | speed);
+ bus_freq = hnd_freq;
+ bus_inited = 1;
+
+ REG_REG("-i2c_init");
+ I2C_PRT("i2c_init: done, bus_inited %d, bus_freq %d\n", bus_inited, bus_freq);
+ return 0;
+}
+
+
+/*
+ * Stop current transaction.
+ * If transmitting then do a master abort, otherwise
+ * just ensure that no new transmission starts.
+ */
+
/*
 * Stop the current transaction. Aborts an in-flight byte if the
 * unit is busy, then clears the transfer control bits matching
 * the direction of the open transaction. Always returns 0.
 */
static int
i2c_stop(void)
{
	I2C_PRT("i2c_stop: entry, bus_inited %d, bus_start_op %d\n", bus_inited, bus_start_op);
	REG_REG("+i2c_stop");

	if (reg_read(ISR_OFFSET) & ISR_UB) {
		I2C_PRT("i2c_stop: Unit busy\n");
		i2c_master_abort();
	}

	switch(bus_start_op) {
	case I2C_WRITE:
		I2C_PRT("i2c_stop: Stop Write\n");
		reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) & ~(ICR_STOP | ICR_TB));
		break;
	case I2C_READ:
		I2C_PRT("i2c_stop: Stop Read\n");
		/* Reads also clear the NAK flag set for the final byte */
		reg_write(ICR_OFFSET, reg_read(ICR_OFFSET) & ~(ICR_STOP | ICR_TB | ICR_ACKNAK));
		break;
	}
	bus_start_op = I2C_NOP;

	REG_REG("-i2c_stop");
	I2C_PRT("i2c_stop: bus_start_op %d\n", bus_start_op);
	return 0;
}
+
+
+/*
+ * Reset I2C controller.
+ * Try to be nice and wait for current transaction to finish
+ */
+
/*
 * Reset the I2C controller: stop any open transaction, pulse the
 * unit-reset bit, clear status/address registers and re-apply the
 * initial control bits. Marks the bus as needing re-init.
 * Always returns 0.
 */
static int
i2c_reset(void)
{
	I2C_PRT("i2c_reset: entry, bus_inited %d\n", bus_inited);
	REG_REG("+i2c_reset");

	i2c_stop();

	reg_write(ICR_OFFSET, ICR_UR);
	myDELAY(1);
	/* Write-1-to-clear all non-reserved status bits */
	reg_write(ISR_OFFSET, ~ISR_RESERVED);
	myDELAY(1);
	reg_write(ICR_OFFSET, 0);
	myDELAY(1);
	reg_write(ISAR_OFFSET, 0);
	myDELAY(1);
	reg_write(ICR_OFFSET, ICR_INIT_BITS);
	bus_inited = 0;

	REG_REG("-i2c_reset");
	I2C_PRT("i2c_reset: exit, bus_inited %d\n", bus_inited);
	return 0;
}
+
+
+/*
+ * Start transaction using current setup.
+ * This is always a send of the target id and the R/W bit.
+ */
+
/*
 * Start a transaction: send the target address plus the R/W bit.
 * On failure the controller is reset and the error from the
 * transmit wait is returned; on success bus_start_op records
 * the open transaction's direction.
 */
static int
i2c_start(int rw)
{
	int err;
	uint32_t temp;

	I2C_PRT("i2c_start: entry, rw %d, bus_slave_addr %02x, bus_start_op %d\n", rw, bus_slave_addr, bus_start_op);
	REG_REG("+i2c_start");

	/*
	 * Avoid addressing ourselves: if the target equals our own
	 * slave address, move our ISAR address one down first.
	 */
	if (hnd_addr == bus_slave_addr) {
		bus_slave_addr = bus_slave_addr - 1;
		I2C_PRT("i2c_start: reset slave %02x\n", bus_slave_addr);
		reg_write(ISAR_OFFSET, bus_slave_addr);
	}

	/* 7-bit address in bits [7:1], R/W flag in bit 0 */
	reg_write(IDBR_OFFSET, (hnd_addr << 1) | rw);
	temp = reg_read(ICR_OFFSET);
	temp |= ICR_START | ICR_TB;
	temp &= ~(ICR_STOP | ICR_ALDIE);
	/* Clear stale status bits before launching the transfer */
	reg_write(ISR_OFFSET, ~ISR_RESERVED);
	reg_write(ICR_OFFSET, temp);

	err = i2c_wait_tx_empty(FALSE, rw);
	if (err) {
		i2c_reset();
		I2C_PRT("i2c_start: exit, err %d\n", err);
		REG_REG("-i2c_start");
		return err;
	}
	bus_start_op = rw;

	REG_REG("-i2c_start");
	I2C_PRT("i2c_start: done, bus_start_op %d\n", bus_start_op);
	return 0;
}
+
+
+/*
+ * Read next byte of transaction
+ * Must follow a 'start' in READ mode.
+ */
+
/*
 * Read the next byte of an open READ transaction into *data
 * (data may be null to discard the byte). When sendStop is set,
 * the controller NAKs the slave and puts a STOP on the bus.
 * Returns 0, -EINVAL if no READ is open, or an RX_*/INCOMPLETE
 * code from the receive wait.
 */
static int
i2c_rd_byte(bool sendStop, uint8_t *data)
{
	int retval;
	uint32_t temp;

	I2C_PRT("i2c_rd_byte: entry, stop %d\n", sendStop);

	if (bus_start_op != I2C_READ) {
		I2C_PRT("i2c_rd_byte: exit, called during WR\n");
		return -EINVAL;
	}

	REG_REG("+i2c_rd_byte");

	temp = reg_read(ICR_OFFSET);
	temp |= (ICR_ALDIE | ICR_TB);
	temp &= ~(ICR_START | ICR_STOP | ICR_ACKNAK);
	if (sendStop)
		temp |= ICR_STOP | ICR_ACKNAK;

	/* Clear stale status bits before launching the transfer */
	reg_write(ISR_OFFSET, ~ISR_RESERVED);
	reg_write(ICR_OFFSET, temp);
	retval = i2c_wait_rx_full(sendStop);
	if (retval) {
		REG_REG("-i2c_rd_byte");
		I2C_PRT("i2c_rd_byte: exit, err %d\n", retval);
		return retval;
	}

	temp = reg_read(IDBR_OFFSET);
	if (data)
		*data = temp;

	if (sendStop)
		i2c_stop();

	REG_REG("-i2c_rd_byte");
	I2C_PRT("i2c_rd_byte: done, data %02x\n", temp);
	return 0;
}
+
+/*
+ * Write next byte of transaction
+ * Must follow a 'start' in WRITE mode.
+ */
+
+static int
+i2c_wr_byte(bool sendStop, uint8_t data)
+{
+ int retval;
+ uint32_t temp;
+
+ I2C_PRT("i2c_wr_byte: entry, stop %d, data %02x\n", sendStop, data);
+
+ if (bus_start_op != I2C_WRITE) {
+ I2C_PRT("i2c_wr_byte: exit, called during RD\n");
+ return EINVAL;
+ }
+
+ REG_REG("+i2c_wr_byte");
+
+ reg_write(IDBR_OFFSET, data);
+
+ temp = reg_read(ICR_OFFSET);
+ temp |= (ICR_ALDIE | ICR_TB);
+ temp &= ~(ICR_START | ICR_STOP);
+ if (sendStop)
+ temp |= ICR_STOP;
+
+ reg_write(ISR_OFFSET, ~ISR_RESERVED);
+ reg_write(ICR_OFFSET, temp);
+ retval = i2c_wait_tx_empty(sendStop, I2C_WRITE);
+ if (retval) {
+ REG_REG("-i2c_wr_byte");
+ I2C_PRT("i2c_wr_byte: exit, err %d\n", retval);
+ return retval;
+ }
+
+ if (sendStop)
+ i2c_stop();
+
+ REG_REG("-i2c_wr_byte");
+ I2C_PRT("i2c_wr_byte: done\n");
+ return 0;
+}
+
+
+/*
+ * Get exclusive access to the I2C bus at _any_ given time.
+ *
+ * If a transaction is in progress then try to complete it
 * in a non-destructive way. We know that the interrupted
+ * activity was from the console access to the UART, which
+ * boils down to just two possible sequences, read UART
+ * register or write UART register. The acting code paths is
+ * sc16is_serial_in()
+ * -> i2c_smbus_read_byte_data
+ * -> i2c_smbus_xfer
+ * -> i2c_smbus_xfer_emulated
+ * -> i2c_transfer
+ * -> i2c_pxa_pio_xfer
+ * -> i2c_pxa_do_pio_xfer
+ * -> i2c_pxa_set_master
+ * -> i2c_pxa_start_message
+ * -> i2c_pxa_handler (repeat for all bytes)
+ * -> i2c_pxa_irq_txempty (on writes)
+ * -> i2c_pxa_irq_rxfull (on reads)
+ * -> i2c_pxa_stop_message
+ *
+ * Function i2c_pxa_handler (designed as an interrupt handler)
+ * is polled every 10 uSec, which is pretty fast for a line that
+ * clocks at 400 kHz (minimum 20 uSec to send one byte).
+ *
+ * The two sequences on the I2C bus for the UART are:
+ *
+ * Write: S <addr | W> A <reg> A <data byte> A P
+ * Read: S <addr | W> A <reg> A Sr <addr | R> A <data byte> A P
+ *
+ * where
+ * S Start sequence
+ * P Stop sequence
+ * Sr Repeated start
+ * W Write flag
+ * R Read flag
+ * A Ack (send or recv)
+ *
 * We need the ability to 'borrow' the I2C bus from the PXA driver
+ * both when it is running (say on another CPU) or when it has been
+ * interrupted (NMI and Exception context).
+ *
+ * From trackers in the PXA driver we get to know the current state
+ * of the I2C transaction with the following granularity:
+ *
+ * '-' Idle
+ * 'B' Waiting for bus free
+ * 'I' Initiating transfer (i.e. send addr & direction flag)
+ * 'S' Sending byte
 *	'R'	Receiving byte
+ *
+ * Last byte of the transaction can be identified by the STOP flag.
+ *
+ * The take-over sequence starts by setting an atomic variable which
+ * tells the PXA driver to wait (and retry the I2C transaction when
+ * the variable gets cleared). Then we look at the controller status
+ * and command registers to determine whether it is active or not.
+ *
+ * Simple cases:
+ * -------------
+ * state = '-'
+ * Controller is not in use by PXA driver.
+ *
+ * state 'B'
+ * Controller not actively in use yet.
+ * At worst the SCLE bit will be set, which won't affect
+ * anything in this driver since we always run as master.
+ *
+ * STOP bit set
+ * This is last byte of a transaction, we have two cases:
+ * a) Last part of a write UART register transaction.
+ * - Wait for the byte to clock out
+ * b) Last part of a read UART register transaction.
+ * - Wait for the byte to clock in, then preserve IDBR.
+ *
+ * Other cases:
+ * ------------
+ * state 'I'
+ * Starting an I2C command (Start or Start-Repeat),
+ * we have 3 sub-cases of this:
+ * a) Starting a write UART register transaction:
+ * - Wait for the byte to clock out, then transmit a
+ * 0 byte with STOP bit set. This selects RX/TX
+ * UART register without accessing it.
+ * b) Starting a read UART register transaction:
+ * - Same as case a), turn it into a NOP.
+ * c) Reversing direction during read UART register,
+ * probably need to finish the read operation:
+ * - Wait for the byte to clock out, send STOP + ACK
+ * and wait for the receive to clock in.
+ *
+ * state 'S'
+ * Since STOP bit is not set, then this is the <reg>
 *	index being transferred, two sub-cases:
+ * a) Sending <reg> of a write UART register.
+ * - Wait for the byte to clock out, then transmit a
 *	   0 byte with the STOP bit set. This inadvertently
+ * and temporarily clears a random UART register,
+ * which may result in a null byte transmitted
+ * Since there is a retry associated, the intended
+ * register value will be written later.
+ * b) Sending <reg> of a read UART register.
+ * - Same as state 'I' case c).
+ *
+ * state 'R'
+ * Should not occur, because communications with the
+ * UART only have single byte reads, which always is
+ * accompanied by a STOP bit, and thus is covered by
+ * the simple case above. If multi-byte reads were to
+ * be used then we'd have to terminate it:
+ * - Wait for the byte to clock in, send STOP + ACK
+ * and wait for the 2nd byte to clock in.
+ * Both bytes received can be discarded, as there
+ * is no easy way to pass them to the PXA driver.
+ *
+ * Warning:
+ * Beyond this being an ugly hack, it is also not re-entrant.
+ * It can reliably interrupt the console and return it without
+ * causing too much breakage, but it cannot grab the I2C bus
+ * from itself due to the use of global variables.
+ *
+ * Warning:
 *	The synchronization between i2c_grab/i2c_release and the
+ * PXA driver can still wreck the I2C controller. Cause not
+ * known, but when it happens the PXA driver ends up repeating
+ * these log messages:
+ * i2c: error: pxa_pio_set_master: timeout
+ * i2c: msg_num: 0 msg_idx: 1 msg_ptr: 0
+ * i2c: ICR: 000017e0 ISR: 00000044
+ * i2c: log: [000000c6:000017e0:00:9a]
+ * i2c i2c-0: i2c_pxa: timeout waiting for bus free
+ * pxa_do_pio_xfer: timeout to become master
+ * pxa_pio_set_master 'B': ISR 00044, ICR 7e0, IDBR 28, IBMR 1
+ * Looks like the I2C controller gets stuck, ISR: IRF + IBB,
+ * The code failing is i2c_pxa_pio_set_master(), which points
+ * to the I2C UART as the culprit. One such case was during
+ * module load on KnF, where the only activity in the module
+ * was one ee_lock/ee_release pair, which in state 'B' should
+ * be straight forward to handle.
+ */
+
+#ifdef CONFIG_I2C_PXA
+#define PXA_SYNC 1
+#else
+#define PXA_SYNC 0
+#endif
+
+#if PXA_SYNC
+static uint32_t sv_icr, sv_isr, sv_isar, sv_idbr, ee_term;
+extern char pxa_state;
+extern atomic_t pxa_block;
+#endif
+
/*
 * Take the I2C controller away from the PXA console driver
 * (see the large comment block above for the full protocol).
 * In PXA_SYNC builds the controller state is saved for
 * i2c_release() and any in-flight console transaction is
 * driven to a harmless completion; otherwise we just wait
 * for the unit to go idle.
 */
static void
i2c_grab(void)
{
	int uwt, n;
	uint32_t icr, isr;
	char * w;

	I2C_PRT("i2c_grab: entry\n");
	REG_REG("+i2c_grab");

#if PXA_SYNC
	/* Preserve full controller state so i2c_release() can restore it */
	sv_isar = reg_read(ISAR_OFFSET);
	sv_idbr = reg_read(IDBR_OFFSET);
	sv_icr = reg_read(ICR_OFFSET);
	isr = sv_isr = reg_read(ISR_OFFSET);
	if ((pxa_state == '-' || pxa_state == 'B') && !(isr & ISR_UB)) {
		REG_REG("-i2c_grab");
		I2C_PRT("i2c_grab: controller idle, isr %08x\n", isr);
		return;
	}
	ee_term = 1;
	I2C_PRT("i2c_grab: controller active, pxa %c\n", pxa_state);
#else
	isr = reg_read(ISR_OFFSET);
	if (!(isr & ISR_UB)) {
		REG_REG("-i2c_grab");
		I2C_PRT("i2c_grab: controller idle, isr %08x\n", isr);
		return;
	}
	I2C_PRT("i2c_grab: controller active\n");
	w = "-";
#endif

	/*
	 * Guess on how long one I2C clock cycle is (in uSec)
	 * Note: ignore High-Speed modes, they are not used.
	 */
	icr = reg_read(ICR_OFFSET);
	uwt = (icr & ICR_FAST_MODE) ? 3 : 10;

	/*
	 * Wait here long enough that current byte transaction
	 * on the I2C controller must have clocked all on its bus.
	 * Empirically, we've determined that length of this wait
	 * can be in range up to a dozen I2C clocks.
	 * We probe state once per I2C clock cycle.
	 */
	for(n = 0; n < 100 && (isr & ISR_UB); n++) {
		/*
		 * Controller busy doing something. Whatever it is
		 * doing, it should set either ITE or IRF when done.
		 * Need to check for this independently because UB
		 * is asserted all the way from START thru STOP.
		 */
		if (isr & (ISR_ITE | ISR_IRF))
			break;
		myDELAY(uwt);
		isr = reg_read(ISR_OFFSET);
	}
	I2C_PRT("i2c_grab: ITE/IRF wait %d uSec, isr %02x, UB %d\n",
		n * uwt, isr, (isr & ISR_UB) == ISR_UB);

	/*
	 * Controller should have finished current byte transfer by now.
	 * If it was last byte of a transaction, we are done.
	 * In read mode we preserve the received data.
	 */
	if (icr & ICR_STOP) {
#if PXA_SYNC
		if (isr & ISR_RWM)
			sv_idbr = reg_read(IDBR_OFFSET);
#endif
		for(n = 0; n < 100 && (isr & ISR_UB); n++) {
			myDELAY(uwt);
			isr = reg_read(ISR_OFFSET);
		}

		REG_REG("-i2c_grab");
		I2C_PRT("i2c_grab: easy case, UB wait %d uSec, bus %sclear, icr %08x, isr %08x\n",
			n * uwt, (isr & ISR_UB) ? "NOT " : "", icr, isr);
		return;
	}

#if PXA_SYNC
	w = "?";

	if (pxa_state == 'I') {
		/* Acknowledge/clear interrupt status bits first */
		isr &= ~ISR_INTS;
		reg_write(ISR_OFFSET, isr);

		if (isr & ISR_RWM) {
			/*
			 * Sub-case c)
			 * Start byte read and send nak+stop when received.
			 */
			I2C_PRT("i2c_grab: state 'I', sub-case c\n");
			icr = (icr & ~ICR_START) | (ICR_STOP | ICR_ACKNAK | ICR_TB);
			reg_write(ICR_OFFSET, icr);
			w = "c";
		}
		else {
			/*
			 * Sub-case a) and b)
			 * Send a null byte and stop the transaction.
			 */
			I2C_PRT("i2c_grab: state 'I', sub-case a & b\n");
			icr = (icr & ~ICR_START) | (ICR_STOP | ICR_TB);
			reg_write(IDBR_OFFSET, 0);
			reg_write(ICR_OFFSET, icr);
			w = "a & b";
		}

		/* Give the byte time to clock out, then wait for UB to drop */
		myDELAY(8 * uwt);
		isr = reg_read(ISR_OFFSET);
		for(n = 0; n < 100 && (isr & ISR_UB); n++) {
			myDELAY(uwt);
			isr = reg_read(ISR_OFFSET);
		}
		if (*w == 'c')
			sv_idbr = reg_read(IDBR_OFFSET);
	}

	if (pxa_state == 'S') {
		/* Acknowledge/clear interrupt status bits first */
		isr &= ~ISR_INTS;
		reg_write(ISR_OFFSET, isr);

		if (isr & ISR_RWM) {
			I2C_PRT("i2c_grab: state 'S', sub-case b\n");
			icr = (icr & ~ICR_START) | (ICR_STOP | ICR_ACKNAK | ICR_TB);
			reg_write(ICR_OFFSET, icr);
			w = "b";
		}
		else {
			I2C_PRT("i2c_grab: state 'S', sub-case a\n");
			icr = (icr & ~ICR_START) | (ICR_STOP | ICR_TB);
			reg_write(IDBR_OFFSET, 0);
			reg_write(ICR_OFFSET, icr);
			w = "a";
		}

		/* Give the byte time to clock out, then wait for UB to drop */
		myDELAY(8 * uwt);
		isr = reg_read(ISR_OFFSET);
		for(n = 0; n < 100 && (isr & ISR_UB); n++) {
			myDELAY(uwt);
			isr = reg_read(ISR_OFFSET);
		}
		if (*w == 'b')
			sv_idbr = reg_read(IDBR_OFFSET);
	}
#endif /* PXA_SYNC */

	REG_REG("-i2c_grab");
	I2C_PRT("i2c_grab: controller %sclear, icr %08x, isr %08x, w %s\n",
		(isr & ISR_UB) ? "NOT " : "", icr, isr, w);
}
+
/*
 * Hand the I2C controller back to the PXA console driver.
 * In PXA_SYNC builds the register state saved by i2c_grab()
 * is restored (with the transfer-byte bit masked off so no
 * transfer restarts spontaneously). Warns if the bus lines
 * (IBMR) are not both released afterwards.
 */
static void
i2c_release(void)
{
	I2C_PRT("i2c_release: entry\n");
	REG_REG("+i2c_release");

#if PXA_SYNC
#if 0
	/*
	 * Reset I2C controller before returning it to PXA driver
	 *TBD: Usually not necessary, remove?
	 */
	if (ee_term) {
		I2C_PRT("i2c_release: resetting bus\n");
		reg_write(ICR_OFFSET, ICR_UR);
		myDELAY(2);
		reg_write(ICR_OFFSET, 0);
	}
#endif

	I2C_PRT("i2c_release: restore controller state\n");
	reg_write(ISR_OFFSET, sv_isr);
	reg_write(ICR_OFFSET, sv_icr & ~ICR_TB);
	reg_write(ISAR_OFFSET, sv_isar);
	reg_write(IDBR_OFFSET, sv_idbr);

	if (ee_term)
		ee_term = 0;
#endif /* PXA_SYNC */

	/* IBMR reflects SDA/SCL pin state; 3 means both lines high (idle) */
	if (reg_read(IBMR_OFFSET) != 3)
		I2C_PRT("i2c_release: WARNING: bus active!!!\n");

	REG_REG("-i2c_release");
	I2C_PRT("i2c_release: exit\n");
}
+
+
+/*
+ * Layer 3 abstraction: I2C driver API (message passing).
+ *
+ * Controls data transfers to/from devices on the I2C bus.
+ * This is what device drivers should use.
+ *
+ * xfr_configure Set target address and speed
+ * xfr_start Start R/W operation
+ * xfr_write Write buffer to target
+ * xfr_read Read buffer from target
+ * xfr_rept_start Repeat-start new R/W operation
+ * xfr_reset Reset driver
+ */
+
+static int
+xfr_configure(uint8_t addr, int freq)
+{
+ XFR_PRT("xfr_configure: entry, addr %02x, freq %d\n", addr, freq);
+
+ if (freq > FREQ_AUTO || freq <= FREQ_MAX) {
+ XFR_PRT("xfr_configure: exit, invalid freq\n");
+ return -EINVAL;
+ }
+
+ if (addr & 0x80) {
+ XFR_PRT("xfr_configure: exit, invalid addr\n");
+ return -EINVAL;
+ }
+
+ hnd_addr = addr;
+ hnd_freq = freq;
+ XFR_PRT("xfr_configure: done, hnd_addr %02x, hnd_freq %d\n", hnd_addr, hnd_freq);
+ return 0;
+}
+
+
/*
 * Start an R/W transaction to the configured target: (re)program
 * the controller if needed, then send address + direction bit.
 * Maps low-level transfer codes to errno values:
 * -EBUSY (timeout), -ENODEV (controller error), -ENXIO (NAK).
 */
static int
xfr_start(int rw)
{
	int err;

	XFR_PRT("xfr_start: entry, rw %d, hnd_addr %02x\n", rw, hnd_addr);

	if (rw != I2C_WRITE && rw != I2C_READ) {
		XFR_PRT("xfr_start: exit, op invalid\n");
		return -EINVAL;
	}

	if (hnd_addr & 0x80) {
		XFR_PRT("xfr_start: exit, hnd_addr %02x invalid\n", hnd_addr);
		return -EINVAL;
	}

	err = i2c_init(hnd_addr);
	if (err) {
		XFR_PRT("xfr_start: i2c_init failed, err %d\n", err);
		i2c_reset();
		return -EIO;
	}

	err = i2c_start(rw);
	if (err)
		XFR_PRT("xfr_start: i2c_start failed, err %d\n", err);
	switch(err) {
	case INCOMPLETE_XFER:
		i2c_stop();
		err = -EBUSY;
		break;
	case TX_CONTROLLER_ERROR:
		i2c_reset();
		err = -ENODEV;
		break;
	case TX_NAK:
		i2c_stop();
		err = -ENXIO;
		break;
	}

	XFR_PRT("xfr_start: done, err %d\n", err);
	return err;
}
+
+
/*
 * Issue a repeated start (direction change without an intervening
 * STOP) on an already-open transaction. Same errno mapping as
 * xfr_start(); -ENXIO if no transaction is open.
 */
static int
xfr_rept_start(int rw)
{
	int err;

	XFR_PRT("xfr_rept_start: entry, rw %d, bus_start_op %d\n", rw, bus_start_op);

	if (bus_start_op != I2C_READ && bus_start_op != I2C_WRITE) {
		XFR_PRT("xfr_rept_start: exit, mode change %d\n", -ENXIO);
		return -ENXIO;
	}

	err = i2c_start(rw);
	if (err)
		XFR_PRT("xfr_rept_start: i2c_start err %d\n", err);
	switch(err) {
	case INCOMPLETE_XFER:
		i2c_stop();
		err = -EBUSY;
		break;
	case TX_CONTROLLER_ERROR:
		i2c_reset();
		err = -ENODEV;
		break;
	case TX_NAK:
		i2c_stop();
		err = -ENXIO;
		break;
	}

	XFR_PRT("xfr_rept_start: done, err %d\n", err);
	return err;
}
+
+
/*
 * Write 'cnt' bytes from 'data' to the open WRITE transaction.
 * A zero count just stops the transaction; sendStop controls
 * whether the final byte carries a STOP. Maps low-level codes
 * to -EBUSY/-ENODEV/-ENXIO; returns -EINVAL on a negative count.
 */
static int
xfr_write(bool sendStop, int cnt, uint8_t *data)
{
	int retval, i;

	XFR_PRT("xfr_write: entry, sendStop %d, cnt %d\n", sendStop, cnt);

	if (cnt < 0) {
		XFR_PRT("xfr_write: exit, bad count %d\n", cnt);
		return -EINVAL;
	}

	if (! cnt) {
		XFR_PRT("xfr_write: null write\n");
		retval = i2c_stop();
		goto out;
	}

	if (cnt == 1) {
		XFR_PRT("xfr_write: 1-byte write, '%02x'\n", *data);
		retval = i2c_wr_byte(sendStop, *data);
		goto out;
	}

	/* All but the last byte go out without a stop condition */
	for (i = 0; i < cnt - 1; i++) {
		XFR_PRT("xfr_write: multi-byte write %d, '%02x'\n", i, data[i]);
		retval = i2c_wr_byte(FALSE, data[i]);
		if (retval)
			goto out;
	}

	XFR_PRT("xfr_write: last of multi-byte write %d, '%02x'\n", cnt - 1, data[cnt - 1]);
	retval = i2c_wr_byte(sendStop, data[cnt - 1]);

out:
	if (retval)
		XFR_PRT("xfr_write: post val %d\n", retval);
	switch(retval) {
	case INCOMPLETE_XFER:
		i2c_stop();
		retval = -EBUSY;
		break;
	case TX_CONTROLLER_ERROR:
		i2c_reset();
		retval = -ENODEV;
		break;
	case TX_NAK:
		i2c_stop();
		retval = -ENXIO;
		break;
	}

	XFR_PRT("xfr_write: done, val %d\n", retval);
	return retval;
}
+
+
/*
 * Read 'cnt' bytes into 'data' (may be null to discard) from the
 * open READ transaction. A zero count just stops the transaction;
 * sendStop controls whether the final byte is NAKed + STOPped.
 * Any low-level error resets the controller and returns -ENXIO;
 * -EINVAL on a negative count.
 */
static int
xfr_read(bool sendStop, int cnt, uint8_t *data)
{
	int retval, i;

	XFR_PRT("xfr_read: entry, stop %d, cnt %d\n", sendStop, cnt);

	if (cnt < 0) {
		XFR_PRT("xfr_read: exit, bad count %d\n", cnt);
		return -EINVAL;
	}

	if (! cnt) {
		XFR_PRT("xfr_read: null read\n");
		retval = i2c_stop();
		goto out;
	}

	if (cnt == 1) {
		XFR_PRT("xfr_read: 1-byte read\n");
		retval = i2c_rd_byte(sendStop, data);
		goto out;
	}

	/* All but the last byte are ACKed (no stop condition) */
	for (i = 0; i < cnt - 1; i++) {
		XFR_PRT("xfr_read: multi-byte read %d\n", i);
		retval = i2c_rd_byte(FALSE, data ? &data[i] : data);
		if (retval)
			goto out;
	}

	XFR_PRT("xfr_read: last of multi-byte read %d\n", cnt - 1);
	retval = i2c_rd_byte(sendStop, data ? &data[cnt - 1] : data);

out:
	if (retval) {
		XFR_PRT("xfr_read: post val %d\n", retval);
		i2c_reset();
		retval = -ENXIO;
	}

	XFR_PRT("xfr_read: done, err %d\n", retval);
	return retval;
}
+
+
#if NOT_YET
/* Reset the driver: thin wrapper around the controller reset */
static void
xfr_reset(void)
{
	i2c_reset();
}
#endif
+
+
+
+/*
+**
+** UART support for printing from exception context.
+** A somewhat crude implementation of two low level
+** routines that write/read CSRs on the I2C UART.
+** On top of these two functions, a set of mid-layer
+** routines adds init/exit and character based I/O.
+** We try not to alter the UART's transmission setup
+** in order lower the risk of corrupting normal use.
+**
+** All UART support routines assume I2C controller
+** to be initialized by xfr_configure() and expects
+** exclusive access to the device
+**
+*/
+
+
+/*
+ * Weird way to say that the I2C UART has slave address
+ * 0x4D (or 0x48) and the UART registers are in bits
+ * [6:3] of the register address byte.
+ * KnF has both I2C UART address pins wired to Vss.
+ * KnC MPI has the address pins wired to Vdd instead.
+ *TBD: That's according to the schematics, in reality
+ * on A0 CRBs the address of the onboard UART is
+ * 0x4D, which matches address pins wired to Vss.
+ * Not sure why that changed.
+ */
+
+#ifdef CONFIG_ML1OM
+#define SC16IS_ADDR_0 1
+#define SC16IS_ADDR_1 1
+#endif
+#ifdef CONFIG_MK1OM /* KAA: MPI specific or KnC specific ? */
+#define SC16IS_ADDR_0 1
+#define SC16IS_ADDR_1 1
+#endif
+#define SC16IS_ADDR(a1, a0) \
+ (0x40 | (((a1 + 8) + (a1 * 3)) | a0))
+#define SC16IS_SUBADDR(addr, ch) \
+ ((addr & 0xf) << 3) | ((ch & 3) << 1)
+
+
/*
 * Read one SC16IS UART register over I2C: write the register
 * sub-address, then repeated-start a 1-byte read.
 * Returns the register value, or 0 on any transfer error
 * (indistinguishable from a register reading 0).
 * NOTE(review): error paths return without an explicit i2c_stop(),
 * leaving the transaction open -- confirm this is intended.
 */
static uint8_t
cons_getreg(int reg)
{
	uint8_t sub, val;
	int err;

	CON_PRT("cons_getreg: reg %02x\n", reg);

	/*
	 * The SC16IS740 device reads 8-bit UART registers
	 * by first writing the register index and then in
	 * an subsequent read operation gets the register
	 * value. The two operations can (and probably
	 * should) be joined by a repeated start to save
	 * the intermediate stop signaling.
	 */
	val = 0;
	sub = (uint8_t) SC16IS_SUBADDR(reg, 0);
	err = xfr_start(I2C_WRITE);
	if (err) {
		CON_PRT("cons_getreg: xfr_start (WR) err %d\n", err);
		return 0;
	}
	err = xfr_write(FALSE, 1, &sub);
	if (err) {
		CON_PRT("cons_getreg: xfr_write (%02x) err %d\n", sub, err);
		return 0;
	}
	err = xfr_rept_start(I2C_READ);
	if (err) {
		CON_PRT("cons_getreg: xfr_rept_start (RD) err %d\n", err);
		return 0;
	}
	err = xfr_read(TRUE, 1, &val);
	if (err) {
		CON_PRT("cons_getreg: xfr_read err %d\n", err);
		return 0;
	}

	CON_PRT("cons_getreg: reg %02x, val %02x\n", reg, val);
	return val;
}
+
+
+static void
+cons_setreg(int reg, int val)
+{
+ uint8_t payload[2];
+ int err;
+
+ CON_PRT("cons_setreg: reg %02x, val %02x\n", reg, val);
+
+ payload[0] = (uint8_t) SC16IS_SUBADDR(reg, 0);
+ payload[1] = (uint8_t) val;
+ CON_PRT("cons_setreg: I2C payload %02x, %02x\n", payload[0], payload[1]);
+ err = xfr_start(I2C_WRITE);
+ if (err) {
+ CON_PRT("cons_setreg: xfr_start (WR) err %d\n", err);
+ return;
+ }
+ err = xfr_write(TRUE, 2, payload);
+ if (err)
+ CON_PRT("cons_getreg: xfr_write (%02x, %02x) err %d\n", payload[0], payload[1], err);
+}
+
+
/*
 * One-time console (UART) setup hook.
 * Currently a no-op: serial parameters are assumed to have been
 * programmed by the kernel LXA driver or the bootstrap code.
 */
static void
cons_init(void)
{
  /*
   * For now assume that the kernel LXA driver or the
   * bootstrap code has setup the I2C uart properly, i.e.
   * we don't need to alter speed/databits/stopbits/parity
   * or any other serial properties.
   *
   *WARNING: Since the switch of console from the I2C uart to
   *         the virtual console, the uart is left with default
   *         serial port speed of 9600 baud. Bootstrap blasts
   *         its messages at 115200 baud, so now the choice
   *         of getting garbage from this routine or from the
   *         bootstrap. Using program stty from userspace may
   *         set any baudrate, we cannot override it here!
   *           # stty 115200 < /dev/ttyS0
   *TBD: make 115200 baud default on I2C uart!
   */
  CON_PRT("cons_init: pass\n");
}
+
+
/*
 * Console teardown hook, counterpart of cons_init().
 * Nothing to undo at present; only leaves a debug trace.
 */
static void
cons_exit(void)
{
  CON_PRT("cons_exit: pass\n");
}
+
+
+#if NOT_YET
/*
 * Non-blocking poll for received data on the console UART.
 * Returns 1 if the Line Status Register reports Data Ready,
 * 0 otherwise (including on I2C error, since cons_getreg()
 * returns 0 in that case).
 */
static int
cons_rxrdy(void)
{
  int val;

  CON_PRT("cons_rxrdy: check console RxRdy\n");

  val = (cons_getreg(UART_LSR) & UART_LSR_DR) ? 1 : 0;

  CON_PRT("cons_rxrdy: RxRdy %d\n", val);
  return val;
}
+
+
/*
 * Blocking read of one character from the console UART.
 * Busy-polls the Data Ready bit with a 1 ms delay between
 * probes; note there is NO timeout — this spins forever if
 * no character ever arrives.
 */
static int
cons_getc(void)
{
  int c;

  CON_PRT("cons_getc: rd from console\n");

  while((cons_getreg(UART_LSR) & UART_LSR_DR) == 0)
    myDELAY(1000);
  c = cons_getreg(UART_RX);

  CON_PRT("cons_getc: read '%02x'\n", c);
  return c;
}
+#endif
+
+
/*
 * Write one character to the console UART.
 * Spins a bounded number of times (10 probes, no delay) waiting
 * for the transmit holding register to drain, then writes the
 * character regardless — i.e. on a stuck UART the character may
 * be dropped rather than hanging the caller.
 */
static void
cons_putc(int c)
{
  int limit;

  CON_PRT("cons_putc: wr '%02x' to console\n", c);

  limit = 10;
  while((cons_getreg(UART_LSR) & UART_LSR_THRE) == 0 && --limit) ;
  CON_PRT("cons_putc: THRE ready, limit %d\n", limit);
  cons_setreg(UART_TX, c);

#if 0
  /*
   * No reason to wait for it to clock out
   */
  limit = 10;
  while((cons_getreg(UART_LSR) & UART_LSR_TEMT) == 0 && --limit) ;
  CON_PRT("cons_putc: TEMT ready, limit %d\n", limit);
#endif

  CON_PRT("cons_putc: done printing '%02x'\n", c);
}
+
+
+/*
+ * Simple exclusive access method for the 'OverClock' I2C bus.
+ * The POST-card UART is the only known other party using this
+ * bus under normal circumstances (because it is the console).
+ * If the POST-card UART is built into the kernel, the lock is
+ * in file 'drivers/serial/8250_sc16is7xx.c'. Otherwise the lock
+ * is local to the RAS module.
+ *
+ * Warning:
+ * This locking works perfectly in standard contexts and in
+ * the MCA handling contexts. However, they do not mix safely.
+ * If the ee_lock is taken from standard context, then an
+ * MCA event may hang because it cannot get the lock, ever!
+ * This can happen when/if ee_print() is used.
+ */
+
/*
 * Bus lock word shared with the PXA I2C driver when that driver is
 * built into the kernel (then it owns the definitions and we only
 * reference them); otherwise defined locally.
 * pxa_block: 0 = bus free, 1 = taken (see ee_lock/ee_unlock).
 * pxa_state: single-character tag, '-' when idle — presumably a
 * debug marker of the current owner; TODO confirm against the
 * 8250_sc16is7xx driver mentioned above.
 */
#ifdef CONFIG_I2C_PXA
extern atomic_t pxa_block;
extern char pxa_state;
#else
atomic_t pxa_block = ATOMIC_INIT(0);
char pxa_state = '-';
#endif
+
/*
 * Acquire exclusive access to the 'OverClock' I2C bus.
 * Spins on an atomic exchange of pxa_block (50 us between probes)
 * until the lock is won, then lets i2c_grab() finish or abort any
 * I2C transaction that was in flight.
 *
 * NOTE(review): as warned above, this spin-lock is not safe when
 * standard context and MCA exception context mix — an MCA handler
 * spinning here against a preempted standard-context owner will
 * never make progress.
 */
static void
ee_lock(void)
{
  /*
   * Wait here until lock acquired
   */
  while(atomic_xchg(&pxa_block, 1))
    myDELAY(50);

  /*
   * Lock taken, I2C transaction could be underway.
   * Wait for it to end or forcefully terminate it.
   */
  i2c_grab();
}
+
/*
 * Release the I2C bus lock taken by ee_lock().
 * The controller is released first, then the lock word is cleared
 * (atomic_xchg used for its full-barrier semantics).
 */
static void
ee_unlock(void)
{
  i2c_release();
  atomic_xchg(&pxa_block, 0);
}
+
+
+/*
+ * Printf to the POST card UART.
+ *
+ * Function ee_printk() and ee_print() both creates
+ * a message into a local buffer from where the RAS
+ * timer will synch them into the kernel log about
+ * once a second. ee_printk() is thread safe.
+ *
+ * Function ee_print() will also attempt to write to
+ * the POST card serial port, which may be useful
+ * from exception context where OS services are out
+ * of the question.
+ *
+ * WARNING: ee_print() takes the same lock as
+ * the machine checks does, so if a machine check
+ * happens while a standard context thread are in
+ * this code we'll have an instant kernel hang.
+ */
+
char ee_buf[EE_BUF_COUNT * EE_BUF_LINELEN];  /* Ring of pending log lines, one slot per message */
atomic_t ee_msg = ATOMIC_INIT(-1);           /* Index of last allocated slot (producer side) */
atomic_t ee_seen = ATOMIC_INIT(-1);          /* Last slot synced to kernel log — consumer is the RAS timer, per comment above; not used in this chunk */
int ee_rdy;                                  /* Non-zero once ee_init() has configured the console UART */

#define EE_TSC 0 /* 1 to get rdtsc() included */
+
+char *
+ee_fmt(char * fmt, va_list args)
+{
+ char * buf;
+ int msg_id, tsl;
+#if EE_TSC
+ uint64_t ts = rdtsc();
+#endif
+
+ msg_id = atomic_inc_return(&ee_msg);
+ buf = ee_buf + (msg_id % EE_BUF_COUNT) * EE_BUF_LINELEN;
+ if (! *buf) {
+#if EE_TSC
+ tsl = snprintf(buf, EE_BUF_LINELEN - 1, "[%lld] ", ts);
+#else
+ tsl = 0;
+#endif
+ vsnprintf(buf + tsl, EE_BUF_LINELEN - 1 - tsl, fmt, args);
+ return buf;
+ }
+ return 0;
+}
+
/*
 * Queue a printf-style message for the RAS timer to push into the
 * kernel log. Returns the formatted length, or 0 if the message
 * was dropped because its ring slot was still occupied.
 */
int
ee_printk(char * fmt, ...)
{
  va_list ap;
  char * line;

  va_start(ap, fmt);
  line = ee_fmt(fmt, ap);
  va_end(ap);

  return line ? strlen(line) : 0;
}
+
/*
 * Queue a message like ee_printk() AND, if the console UART is
 * ready, also push it out the POST-card serial port under the
 * I2C bus lock. Returns the number of characters sent to the
 * UART (0 if the UART is not ready or the message was dropped).
 *
 * WARNING (see block comment above): takes the same lock as the
 * machine check path; calling this from standard context can
 * deadlock an MCA handler.
 */
int
ee_print(char * fmt, ...)
{
  char ch, * buf;
  va_list args;
  int len;

  va_start(args, fmt);
  buf = ee_fmt(fmt, args);
  va_end(args);

  len = 0;
  if (ee_rdy && buf) {
    /*
     * Get I2C bus exclusive access,
     * setup for targeting the UART and
     * send string one byte at a time
     * with LF -> CR+LF translation.
     */
    ee_lock();
    xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO);
    while((ch = *(buf++))) {
      if (ch == '\n') {
        cons_putc('\r');
        len++;
      }
      cons_putc(ch);
      len++;
    }
    ee_unlock();
  }

  return len;
}
EXPORT_SYMBOL_GPL(ee_print);
+
+
+
+/*
+**
+** EEPROM support routines
+**
+** The device is a 1 Mbit Atmel AT24C1024 which has 128
+** KByte addressable storage over 2 slave addresses.
+** Lower 64 KB is at slave address 0x54 and upper
+** 64KB is at slave address 0x55, i.e. it uses LSB of
+** the slave address as bit 16 of the byte address.
+**
+** All EEPROM support routines assume I2C controller
+** to be initialized by xfr_configure() and expects
+** exclusive access to the device
+**
+** Only KnC has this storage
+*/
+
+#ifdef CONFIG_MK1OM
+
+#define MR_ELOG_SIZE (128 * 1024) /* 1 Mbit */
+#define MR_ELOG_ADDR_LO 0x54 /* Lo 64K slave */
+#define MR_ELOG_ADDR_HI 0x55 /* Hi 64K slave */
+#define EE_PG_SIZ 256 /* Device page size */
+
+
+/*
+ * Layout of the EEPROM is roughly like this:
+ *
+ * Bytes Content
+ * 0 - 15 Fixed log header
+ * 16 - 17 Log head index (last written)
+ * 18 - 19 Log tail index (last read)
+ * 20 - end Log entries
+ *
+ * By definition, the log is fully read when head and
+ * tail pointer are equal (initial value: last entry).
+ * The effective log size is
+ * (device_size - sizeof(McaHeader))/sizeof(McaRecord).
+ *
+ * Fields of interest in the log entry 'id' are
+ * bits 7:0 Source index, 8 bit
+ * bits 18:16 Source type, 3 bit
+ * bits 22:22 Injected error flag
+ * bits 23:23 Repaired flag
+ * bits 24:24 Filtered flag
+ * bits 31:31 Valid flag
+ *
+ * Enumeration details are in file micras_mca.h
+ *
+ * Time stamps in the MCA header and event records are supposed to be
+ * standard 32-bit Unix format, i.e. seconds since 00:00 Jan 1 1979 GMT.
+ * This will wrap some time Jan 19th 2038, which is about 25 years from
+ * the release of KnC. Given the use of 386's (introduced 1985) in the
+ * modern data center anno '12, 32 bit will last for all practical purposes.
+ */
+
/*
 * On-EEPROM log header (bytes 0-15 of the device, plus the two
 * head/tail indices at bytes 16-19 described in the layout above).
 */
typedef struct _mca_header {
  uint8_t signature[8];  /* Magic */
  uint8_t header_ver;    /* Format revision */
  uint8_t rec_start;     /* Offset of 1st record */
  uint16_t rec_size;     /* Size of an MCA record */
  uint16_t entries;      /* Log size */
  uint8_t logfull;       /* Log has wrapped (reserved) */
  uint8_t hwtype;        /* Board type (reserved) */
  uint16_t rec_head;     /* Head index */
  uint16_t rec_tail;     /* Tail index */
} McaHeader;
+
/*
 * One logged MCA event as stored in EEPROM (32 bytes).
 * 'id' bit layout is documented in the comment block above.
 */
typedef struct _mca_record {
  uint32_t id;     /* Event origin & flags */
  uint32_t stamp;  /* Low 32 bit of system time */
  uint64_t ctl;    /* MCA bank register 'CTL' */
  uint64_t status; /* MCA bank register 'STATUS' */
  uint64_t addr;   /* MCA bank register 'ADDR' */
  uint64_t misc;   /* MCA bank register 'MISC' */
} McaRecord;
+
+
+/*
+ * Header to drop onto un-initalized EEPROM
+ * By definition, the EEPROM is uninitialised
+ * if the magic signature is wrong.
+ */
+
+#define MR_ELOG_NUM (MR_ELOG_SIZE - sizeof(McaHeader))/sizeof(McaRecord)
+
/*
 * Template header written to a virgin (or reset) EEPROM.
 * Head and tail both start at the last entry, i.e. "log empty".
 * Note: logfull is uint8_t, so the -1 preset stores as 0xff.
 */
static McaHeader elog_preset = {
  .signature = {"MCA_LOG"},
  .header_ver = 1,
  .rec_start = sizeof(McaHeader),
  .rec_size = sizeof(McaRecord),
  .entries = MR_ELOG_NUM,
  .logfull = -1,
  .hwtype = 0,
  .rec_head = MR_ELOG_NUM - 1,
  .rec_tail = MR_ELOG_NUM - 1,
};

/* In-memory copy of the log state; ee_num == 0 means EEPROM unusable */
static uint16_t ee_num, ee_head, ee_tail; /* Cached log state */
+
+
+#if EPR_DBG || EE_VERIFY
+/*
+ * Printk from EEPROM code.
+ * We have the lock, and the I2C target address is
+ * set for the Atmel device, we must reset I2C for
+ * the UART on every entry, and reset it back to the
+ * EEPROM in order to keep this function transparent.
+ *
+ * Warning: this call is highly risky, particularly
+ * in error conditions where the I2C bus is involved.
+ * Do not call it during an EEPROM I2C transaction!!
+ * Use for internal debug _ONLY_ and at own risk.
+ */
+
/*
 * Debug-only printf to the POST-card UART from inside EEPROM code.
 * Retargets the I2C controller at the UART; it does NOT restore
 * the EEPROM slave address itself — that is safe only because
 * ee_rd()/ee_wr() re-issue xfr_configure() on every call.
 * Returns the number of characters sent (with LF -> CR+LF
 * translation), 0 if the message was dropped by ee_fmt().
 * See the warning above: never call mid-EEPROM-transaction.
 */
int
elog_print(char * fmt, ...)
{
  char * buf, ch;
  va_list args;
  int len;

  va_start(args, fmt);
  buf = ee_fmt(fmt, args);
  va_end(args);

  if (! buf)
    return 0;

  xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO);

  len = 0;
  while((ch = *(buf++))) {
    if (ch == '\n') {
      cons_putc('\r');
      len++;
    }
    cons_putc(ch);
    len++;
  }

  return len;
}
+#endif /* EPR_DBG */
+
+
+/*
+ * Write block of data to EEPROM
+ * The Atmel device does not allow writes to cross the
+ * internal page size, which is 256 bytes on the 1 Mbit part.
+ * Given the size of an McaRecord this is likely to occur, but
+ * cannot happen more than once per call.
+ * Must preset slave address on every call.
+ */
+
+static void
+ee_wr(uint8_t addr, uint16_t ofs, uint8_t *buf, uint8_t len)
+{
+ uint16_t pix, swp;
+ uint8_t wl;
+ int err;
+
+ if (mce_disabled)
+ return;
+
+ if ((ofs + len) < ofs) {
+ EPR_PRT("ee_wr: address overrun\n");
+ return;
+ }
+
+ xfr_configure(addr, FREQ_AUTO);
+
+ pix = ofs & (EE_PG_SIZ - 1);
+ while(len) {
+ wl = (uint8_t) min((uint16_t)len, (uint16_t)(EE_PG_SIZ - pix));
+
+ err = xfr_start(I2C_WRITE);
+ if (err) {
+ EPR_PRT("ee_wr: xfr_start (WR) err %d\n", err);
+ return;
+ }
+
+ /*
+ * Byte swap, send Most significant byte first
+ */
+ swp = (ofs >> 8) | (ofs << 8);
+ err = xfr_write(FALSE, 2, (uint8_t *) &swp);
+ if (err) {
+ EPR_PRT("ee_wr: xfr_write offset (%02x, %02x) err %d\n", ofs >> 8, ofs & 0xff, err);
+ return;
+ }
+
+ /*
+ * Write payload to device
+ */
+ err = xfr_write(TRUE, wl, buf);
+ if (err) {
+ EPR_PRT("ee_wr: xfr_write %d bytes (%02x, %02x ..) err %d\n", wl, buf[0], buf[1], err);
+ return;
+ }
+ ofs += wl;
+ buf += wl;
+ len -= wl;
+ pix = 0;
+
+ /*
+ * Data sheet says wait 5 mSec before next
+ * transaction to the device after a write.
+ */
+ myDELAY(5000);
+ }
+}
+
+
+/*
+ * Read block of data from EEPROM
+ * Must preset slave address on every call.
+ */
+
+static void
+ee_rd(uint8_t addr, uint16_t ofs, uint8_t *buf, uint8_t len)
+{
+ uint16_t swp;
+ int err;
+
+ if ((ofs + len) < ofs) {
+ EPR_PRT("ee_rd: address overrun\n");
+ return;
+ }
+
+ xfr_configure(addr, FREQ_AUTO);
+
+ err = xfr_start(I2C_WRITE);
+ if (err) {
+ EPR_PRT("ee_rd: xfr_start (WR) err %d\n", err);
+ return;
+ }
+
+ /*
+ * Byte swap, send Most significant byte first
+ */
+ swp = (ofs >> 8) | (ofs << 8);
+ err = xfr_write(FALSE, 2, (uint8_t *) &swp);
+ if (err) {
+ EPR_PRT("ee_rd: xfr_write (%02x, %02x) err %d\n", ofs >> 8, ofs & 0xff, err);
+ return;
+ }
+
+ /*
+ * Change bus direction and read payload
+ */
+ err = xfr_rept_start(I2C_READ);
+ if (err) {
+ EPR_PRT("ee_rd: xfr_rept_start (RD) err %d\n", err);
+ return;
+ }
+ err = xfr_read(TRUE, len, buf);
+ if (err) {
+ EPR_PRT("ee_rd: xfr_read err %d\n", err);
+ return;
+ }
+}
+
+
+/*
+ * Read one MCA event record from EEPROM
+ * Handles crossing device addresses.
+ */
+
/*
 * Read MCA event record number 'no' from the EEPROM into *rec.
 * The device presents its 128 KB as two 64 KB I2C slaves, so a
 * record may live entirely in either half or straddle the split,
 * in which case two reads are issued.
 * The record is zeroed first, so an I2C failure leaves *rec zeroed
 * rather than stale.
 * NOTE(review): when pos lands exactly in [mid - sizeof(McaRecord),
 * mid] the split path runs with one zero-length ee_rd() — harmless
 * looking, but worth confirming against ee_rd()'s behavior.
 */
static void
ee_get(McaRecord * rec, int no)
{
  uint32_t pos, mid, low;

  mid = MR_ELOG_SIZE / 2;
  memset(rec, '\0', sizeof(*rec));
  pos = sizeof(McaHeader) + no * sizeof(McaRecord);
  if (pos < (mid - sizeof(McaRecord))) {
    /*
     * Record fit entirely in lower half of EEPROM
     */
    ee_rd(MR_ELOG_ADDR_LO, pos, (uint8_t *) rec, sizeof(*rec));
  }
  else
  if (pos > mid) {
    /*
     * Record fit entirely in upper half of EEPROM
     */
    ee_rd(MR_ELOG_ADDR_HI, pos - mid, (uint8_t *) rec, sizeof(*rec));
  }
  else {
    /*
     * Record spans both halves, need 2 reads.
     */
    low = mid - pos;
    ee_rd(MR_ELOG_ADDR_LO, pos, (uint8_t *) rec, low);
    ee_rd(MR_ELOG_ADDR_HI, 0, ((uint8_t *) rec) + low, sizeof(*rec) - low);
  }
}
+
+
+/*
+ * Write one MCA event record to EEPROM
+ * Handles crossing device addresses.
+ */
+
/*
 * Write MCA event record number 'no' to the EEPROM.
 * Mirror of ee_get(): a record entirely inside one 64 KB half is
 * written with one call, a record straddling the half boundary is
 * split into two writes (one per slave address).
 */
static void
ee_put(McaRecord * rec, int no)
{
  uint32_t loc, mid, low;

  mid = MR_ELOG_SIZE / 2;
  loc = sizeof(McaHeader) + no * sizeof(McaRecord);
  if (loc < (mid - sizeof(McaRecord))) {
    /*
     * Record fit entirely in lower half of EEPROM
     */
    ee_wr(MR_ELOG_ADDR_LO, loc, (uint8_t *) rec, sizeof(*rec));
  }
  else
  if (loc > mid) {
    /*
     * Record fit entirely in upper half of EEPROM
     */
    ee_wr(MR_ELOG_ADDR_HI, loc - mid, (uint8_t *) rec, sizeof(*rec));
  }
  else {
    /*
     * Record spans both halves, need 2 writes.
     */
    low = mid - loc;
    ee_wr(MR_ELOG_ADDR_LO, loc, (uint8_t *) rec, low);
    ee_wr(MR_ELOG_ADDR_HI, 0, ((uint8_t *) rec) + low, sizeof(*rec) - low);
  }
}
+
+
+/*
+ * Add one MCA event to the EEPROM
+ * Store the passed event info in the EEPROM, and update write
+ * position to next entry, just in case if there are more than
+ * one MC event detected that needs checking in maintenance mode.
+ *
+ * This can be called in exception context, and therefore must
+ * work without any kernel support whatsoever. We must assume
+ * kernel services are not reliable at this point.
+ */
+
+void
+micras_mc_log(struct mce_info * event)
+{
+ McaRecord mr;
+ uint16_t nxt, id;
+
+ if (mce_disabled)
+ return;
+
+ /*
+ * Print entry on serial console (copy in kernel log)
+ */
+#if MC_VERBOSE
+ ee_printk("RAS.elog: bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
+ event->org, event->id, event->ctl, event->status, event->addr, event->misc);
+#endif
+
+ /*
+ * Bail if EEPROM not in order (I2C lock-up or faulty device)
+ */
+ if (! ee_num)
+ return;
+
+ /*
+ * Prepare MCA error log record.
+ * We use the pysical CPU ID in the EEPROM records.
+ */
+ id = (event->org <= 2) ? event->pid : event->id;
+ mr.id = PUT_BITS( 7, 0, id) |
+ PUT_BITS(18, 16, event->org) |
+ PUT_BIT(22, (event->flags & MC_FLG_FALSE) != 0) |
+ PUT_BIT(24, (event->flags & MC_FLG_FILTER) != 0) |
+ PUT_BIT(31, 1);
+ mr.stamp = (uint32_t) event->stamp;
+ mr.ctl = event->ctl;
+ mr.status = event->status;
+ mr.addr = event->addr;
+ mr.misc = event->misc;
+
+#if ADD_DIE_TEMP
+ {
+ uint32_t tmp;
+ tmp = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2);
+ mr.id |= PUT_BITS(15, 8, GET_BITS(19, 10, tmp));
+ }
+#endif
+
+ /*
+ * Get I2C bus exclusive access
+ */
+ ee_lock();
+
+#if EE_VERIFY
+ {
+ /*
+ * Check for header corruption.
+ * Time sink, only enable for debugging
+ */
+ extern int in_sync;
+ McaHeader hdr;
+
+ ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr));
+ if (memcmp(hdr.signature, elog_preset.signature,
+ sizeof(elog_preset.signature))) {
+ if (in_sync) {
+ printk("mc_log: Header corruption detected\n");
+ dmp_hex(&hdr, sizeof(hdr), "mc_log: EEPROM header (entry)");
+ }
+ else {
+ elog_print("mc_log: Header corruption detected (entry)\n");
+ elog_print("EEPROM header: signature bad, ver %d, type %d\n",
+ hdr.header_ver, hdr.hwtype);
+ elog_print("EEPROM capacity: %d events, size %d, start %d\n",
+ hdr.entries, hdr.rec_size, hdr.rec_start);
+ elog_print("EEPROM state: head %d, tail %d, full %d\n",
+ hdr.rec_head, hdr.rec_tail, hdr.logfull);
+ }
+ }
+ }
+#endif
+
+ nxt = (ee_head + 1) % ee_num;
+ if (nxt == ee_tail) {
+ ee_printk("RAS.elog: EEPROM full, dropping event\n");
+ ee_unlock();
+ return;
+ }
+ ee_put(&mr, nxt);
+
+#if EE_VERIFY
+ {
+ /*
+ * Read back and verify with memory buffer
+ * Note: only works on 1st half of device.
+ * Time sink, only enable for debugging
+ */
+ McaRecord tst;
+
+ ee_rd(MR_ELOG_ADDR_LO, loc, (uint8_t *) &tst, sizeof(tst));
+ if (memcmp(&mr, &tst, sizeof(tst)))
+ elog_print("Write event verify failed\n");
+ else
+ elog_print("Write event verify OK\n");
+ }
+#endif
+
+ /*
+ * Update head pointer in EEPROM header
+ */
+ ee_wr(MR_ELOG_ADDR_LO, offsetof(McaHeader, rec_head), (uint8_t *) &nxt, sizeof(nxt));
+ ee_head = nxt;
+
+#if EE_VERIFY
+ {
+ /*
+ * Read back and verify with memory buffer
+ * Time sink, only enable for debugging
+ */
+ uint16_t tst;
+
+ ee_rd(MR_ELOG_ADDR_LO, 16, (uint8_t *) &tst, 2);
+ if (tst != nxt)
+ elog_print("Write index verify failed\n");
+ else
+ elog_print("Write index verify OK\n");
+ }
+
+ {
+ /*
+ * Check again for header corruption
+ * Time sink, only enable for debugging
+ */
+ extern int in_sync;
+ McaHeader hdr;
+
+ ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr));
+ if (memcmp(hdr.signature, elog_preset.signature,
+ sizeof(elog_preset.signature))) {
+ if (in_sync) {
+ printk("mc_log: Header corruption detected (exit)\n");
+ dmp_hex(&hdr, sizeof(hdr), "mc_log: EEPROM header");
+ }
+ else {
+ elog_print("mc_log: Header corruption detected (exit)\n");
+ elog_print("EEPROM header: signature bad, ver %d, type %d\n",
+ hdr.header_ver, hdr.hwtype);
+ elog_print("EEPROM capacity: %d events, size %d, start %d\n",
+ hdr.entries, hdr.rec_size, hdr.rec_start);
+ elog_print("EEPROM state: head %d, tail %d, full %d\n",
+ hdr.rec_head, hdr.rec_tail, hdr.logfull);
+ }
+ }
+ }
+#endif
+
+ /*
+ * Release I2C bus exclusive lock
+ */
+ ee_unlock();
+}
+
+
+/*
+ * Reset the EEPROM to mint condition
+ */
+
#define BSIZ 0xf0 /* NOTE(review): appears unused in this chunk — candidate for removal */

/*
 * Reset the EEPROM to mint condition: fill the whole device with
 * 0xff in half-page chunks, write a fresh preset header, then read
 * it back and re-cache the log state. On verify failure the log is
 * disabled (ee_num = 0). Runs under the I2C bus lock.
 */
static void
ee_mint(void)
{
  uint8_t buf[EE_PG_SIZ];
  McaHeader hdr;
  uint32_t loc, mid;
  uint16_t ofs;
  uint8_t addr;


  if (ee_rdy && ! mce_disabled) {
    printk("EEPROM erase started ..\n");
    memset(buf, 0xff, sizeof(buf));

    ee_lock();

    /*
     * Several cheats in this loop.
     * - Despite maximum transfer per write command is 255 (8 bit count),
     *   we send only half a 'page', i.e. 128 byte, per call to ee_wr().
     * - Picking exactly half a page, starting page aligned, ensures there
     *   will be no writes across a page boundary, i.e. ee_wr() will always
     *   result in exactly one I2C write command per call.
     * - We know that MR_ELOG_SIZE / (EE_PG_SIZ / 2) is a clean integer,
     *   and therefore will be no end condition to special case.
     * - Same will be true for the 'mid-chip' limit where the target
     *   address is bumped by one.
     */
    mid = MR_ELOG_SIZE / 2;
    for(loc = 0; loc < MR_ELOG_SIZE; loc += (EE_PG_SIZ / 2)) {
      addr = (loc < mid) ? MR_ELOG_ADDR_LO : MR_ELOG_ADDR_HI;
      ofs = loc & 0xffff;
      // printk(" -- loc %5x: addr %2x, offs %4x, len %4x\n", loc, addr, ofs, EE_PG_SIZ / 2);
      ee_wr(addr, ofs, buf, EE_PG_SIZ / 2);
    }

    /*
     * Put in a fresh header
     */
    ee_wr(MR_ELOG_ADDR_LO, 0, (uint8_t *) &elog_preset, sizeof(elog_preset));
    ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr));
    printk("EEPROM erase complete\n");

    ee_unlock();

    /*
     * Verify that the header stuck.
     * If not, then complain to kernel log and set event capacity to 0
     */
    if (memcmp(hdr.signature, elog_preset.signature, sizeof(elog_preset.signature)) ||
        hdr.header_ver != elog_preset.header_ver ||
        hdr.rec_start != elog_preset.rec_start ||
        hdr.rec_size != elog_preset.rec_size ||
        hdr.hwtype != elog_preset.hwtype) {
      /*
       * Write EEPROM header failed.
       * Leave a message in the kernel log about it.
       */
      printk("Error: EEPROM initialization failed!\n");
      printk("MCA events cannot be logged to EEPROM\n");
      ee_num = 0;
    }
    else {
      ee_num = hdr.entries;
      ee_head = hdr.rec_head;
      ee_tail = hdr.rec_tail;
      printk("EEPROM ready!\n");
    }


  }
}
+
+
+#if EE_PROC
+/*
+ * Support for user space access to the EEPROM event log.
+ * Implemented as a 'proc' file named elog, who returns
+ * MCE events on read and on writes of 6 hex values
+ * per line creates new event(s) to be entered.
+ *
+ * Compile time configurable for disabling writes and
+ * choice of whether to dump new events or everything.
+ */
+
+static struct proc_dir_entry * elog_pe;
+
+/*
+ * Write is just a simple file operation.
+ * We do not care about file offset since the specified event is to
+ * be added to the EEPROM at head+1, not at any arbitrary location.
+ */
+
/*
 * /proc/elog write handler.
 * Accepts either the literal word "reset" (wipes the EEPROM via
 * ee_mint()) or six whitespace-separated hex numbers forming one
 * event record: id, stamp, ctl, status, addr, misc. The record is
 * appended at head+1; the file offset is deliberately ignored.
 * Returns bytes consumed (up to and including the first newline)
 * or a negative errno.
 * NOTE(review): unlike micras_mc_log(), this path does not check
 * for a full log (nxt == ee_tail) — presumably acceptable for an
 * injection/debug interface; confirm.
 */
static ssize_t
elog_write(struct file * file, const char __user * buff, size_t len, loff_t * off)
{
  char * buf;
  uint16_t nxt;
  McaRecord mr;
  uint64_t ull[6];
  char * ep, * cp;
  int i, err;

  /*
   * Get input line into kernel space
   */
  if (len > PAGE_SIZE -1)
    len = PAGE_SIZE -1;
  buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  if (! buf)
    return -ENOMEM;
  if (copy_from_user(buf, buff, len)) {
    err = -EFAULT;
    goto wr_out;
  }
  buf[len] = '\0';
  cp = ep = (char *) buf;

  /*
   * Special case EEPROM reset option,
   * first 5 letters form the word 'reset'
   */
  if (!strncmp(buf, "reset", 5)) {
    ee_mint();
    goto wr_one;
  }

  /*
   * Need 6 numbers for an event record
   */
  for(i = 0; i < 6; i++) {
    while(isspace(*cp))
      cp++;
    ull[i] = simple_strtoull(cp, &ep, 16);
    if (ep == cp || (*ep != '\0' && !isspace(*ep))) {
      err = -EINVAL;
      goto wr_out;
    }
    cp = ep;
  }

#if 0
  /*
   * If we were to screen this the we should ensure that
   *   id[7:0]     < CPU_MAX    on org 0, 1, 2
   *               < DBOX_NUM   on org 3
   *               == 0         on org 4
   *               < GBOX_NUM   on org 5
   *               < TBOX_NUM   on org 6
   *   id[18:16]  <= 6
   *   id[23]     == 0
   *   id[31]     == 1
   */
#endif

  if (ee_num) {
    mr.id = (uint32_t) ull[0];
    mr.stamp = (uint32_t) ull[1];
    mr.ctl = ull[2];
    mr.status = ull[3];
    mr.addr = ull[4];
    mr.misc = ull[5];

    /*
     * Add event record under I2C bus exclusive access
     */
    ee_lock();
    nxt = (ee_head + 1) % ee_num;
    ee_put(&mr, nxt);
    ee_wr(MR_ELOG_ADDR_LO, offsetof(McaHeader, rec_head), (uint8_t *) &nxt, sizeof(nxt));
    ee_head = nxt;
    ee_unlock();
  }

  /*
   * Swallow any trailing junk up to next newline
   */
wr_one:
  ep = strchr(buf, '\n');
  if (ep)
    cp = ep + 1;
  err = cp - buf;

wr_out:
  kfree(buf);
  return err;
}
+
+
+/*
+ * Use the sequencer to read one event at a time,
+ * in order of occurrence in the EEPROM. Sequence
+ * position is event index in range 0 .. ee_num,
+ * which will be offset by (ee_tail + 1) modulo
+ * ee_num if EE_PROC_NEW flag is set.
+ */
+
+static int elog_eof; /* Elog end-of-file marker */
+
/*
 * seq_file 'show' callback: print one EEPROM event per position.
 * Position 0 also prints the card identity banner and log state.
 * With EE_PROC_NEW the position is rebased to tail+1 so only
 * unread events appear; otherwise the raw index is used and a
 * run of >10 invalid-looking entries sets elog_eof to stop the
 * sequencer early.
 */
static int
elog_seq_show(struct seq_file * f, void * v)
{
  McaRecord mr;
  int pos, nxt;
  static int inv;  /* consecutive invalid entries seen (non-EE_PROC_NEW scan) */

  pos = *(loff_t *) v;

  /*
   * Print nice header on 1st read from /proc/elog
   */
  if (! pos) {
    extern struct mr_rsp_hwinf hwinf;
    struct mr_rsp_hwinf * r = &hwinf;

    inv = 0;
    seq_printf(f, "Card %c%c%c%c%c%c%c%c%c%c%c%c: "
                  "brd %d, fab %d, sku %d, rev %d, stp %d, sub %d\n",
        r->serial[0], r->serial[1], r->serial[2], r->serial[3],
        r->serial[4], r->serial[5], r->serial[6], r->serial[7],
        r->serial[8], r->serial[9], r->serial[10], r->serial[11],
        r->board, r->fab, r->sku, r->rev, r->step, r->substep);
    if (ee_num) {
      seq_printf(f, "Head %d, tail %d, cap %d\n", ee_head, ee_tail, ee_num);
      seq_printf(f, "%5s %8s %12s %8s %16s %16s %16s %16s\n",
          "index", "id", "id decode", "time", "ctrl", "status", "addr", "misc");
    }
    else
      seq_printf(f, "Error: EEPROM not initialized\n");
  }

  /*
   * Set EOF and quit if EEPROM not accessible
   */
  if (! ee_num) {
    elog_eof = 1;
    return 0;
  }

  /*
   * Get event under I2C bus exclusive access
   */
#if EE_PROC_NEW
  nxt = (pos + ee_tail + 1) % ee_num;
#else
  nxt = pos;
#endif
  ee_lock();
  ee_get(&mr, nxt);
  ee_unlock();

#if ! EE_PROC_NEW
  /*
   * We refuse to print invalid entries.
   * However, a freshly reset EEPROM contains all 1s and
   * therefore we won't rely on the valid-bit alone.
   * Instead rely on the unused areas of 'id' to be 0s.
   * Probably need to stop sequencer once a bad entry is
   * seen because in all likelihood we've reached the
   * log end and reading the remainder of the EEPROM will
   * just be waste of time.
   */
  if (GET_BITS(30, 25, mr.id) == 0x3f &&
      GET_BITS(21, 19, mr.id) == 0x07 &&
      GET_BITS(15, 8, mr.id) == 0xff) {
    if (inv++ > 10)
      elog_eof = 1;
    return 0;
  }
#endif

  /* One line per event: index, raw id, decoded id fields, then registers */
  seq_printf(f, "%5d %08x [%d %3d %c%c%c%c] %08x %016llx %016llx %016llx %016llx\n",
      nxt, mr.id,
      GET_BITS(18,16,mr.id),
      GET_BITS(7,0,mr.id),
      GET_BIT(22,mr.id) ? 'I' : ' ',
      GET_BIT(23,mr.id) ? 'R' : ' ',
      GET_BIT(24,mr.id) ? 'F' : ' ',
      GET_BIT(31,mr.id) ? 'V' : ' ',
      mr.stamp, mr.ctl, mr.status, mr.addr, mr.misc);

  return 0;
}
+
/*
 * seq_file 'start' callback.
 * Returns NULL (no output) once the position passes the log
 * capacity, and — when only new events are shown (EE_PROC_NEW) —
 * when the log is empty or the position has wrapped to the head.
 */
static void *
elog_seq_start(struct seq_file * f, loff_t * pos)
{
  if (ee_num) {
    if (*pos >= ee_num)
      return NULL;
#if EE_PROC_NEW
    /*
     * Stop when the tail-rebased position catches up with head
     * (log empty, or all new entries already emitted).
     */
    if (ee_head == ee_tail)
      return NULL;
    if (*pos && ((*pos + ee_tail) % ee_num) == ee_head)
      return NULL;
#endif
  }

  elog_eof = 0;

  return pos;
}
+
/*
 * seq_file 'next' callback.
 * Honors the elog_eof early-stop flag set by the show routine,
 * bumps the position, and terminates at log capacity or (with
 * EE_PROC_NEW) when the rebased position reaches the head.
 */
static void *
elog_seq_next(struct seq_file * f, void * v, loff_t * pos)
{
  if (elog_eof)
    return NULL;

  (*pos)++;
  if (*pos >= ee_num)
    return NULL;

#if EE_PROC_NEW
  /*
   * Wrap check: stop once the tail-rebased index hits the head.
   */
  {
    int nxt;

    nxt = ((*pos) + ee_tail) % ee_num;
    if (nxt == ee_head)
      return NULL;
  }
#endif

  return pos;
}
+
/*
 * seq_file 'stop' callback — nothing to clean up.
 */
static void
elog_seq_stop(struct seq_file * f, void * v)
{
}
+
/* Sequencer operations backing /proc/elog reads */
static const struct seq_operations elog_seq_ops = {
  .start = elog_seq_start,
  .next = elog_seq_next,
  .stop = elog_seq_stop,
  .show = elog_seq_show,
};
+
/*
 * /proc/elog open handler: attach the event-log sequencer.
 */
static int
elog_open(struct inode *inode, struct file *filp)
{
  return seq_open(filp, &elog_seq_ops);
}
+
/*
 * File operations for /proc/elog.
 * Not const: ee_init() clears .write at runtime when event
 * injection is not permitted.
 */
static struct file_operations proc_elog_operations = {
  .open = elog_open,
  .read = seq_read,
  .llseek = seq_lseek,
  .release = seq_release,
  .write = elog_write,
};
+
+#endif /* EE_PROC */
+
+
+
+/*
+**
+** Validation hooks.
+**
+** ee_list List EEPROM contents to kernel log
+** ee_wipe Clear EEPROM (after RAS testing)
+**
+** Used by validation, exported entry point
+** Do not enable this in production code.
+**
+*/
+
+void
+ee_list(void)
+{
+ McaHeader hdr;
+ McaRecord rec;
+ int pos, i;
+
+ /*
+ * Get I2C bus exclusive access
+ */
+ ee_lock();
+
+ /*
+ * Read header
+ */
+ ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr));
+ if (! strncmp(hdr.signature, "MCA_LOG", sizeof(hdr.signature))) {
+ printk("MCE log header: signature OK, ver %d, type %d\n",
+ hdr.header_ver, hdr.hwtype);
+ printk("MCE log capacity: %d events, size %d, start %d\n",
+ hdr.entries, hdr.rec_size, hdr.rec_start);
+ printk("MCE log state: head %d, tail %d, full %d\n",
+ hdr.rec_head, hdr.rec_tail, hdr.logfull);
+ if (hdr.entries != MR_ELOG_NUM) {
+ printk("MCE log check: invalid capacity, expected %ld\n", MR_ELOG_NUM);
+ goto ee_bad;
+ }
+ if (hdr.rec_size != sizeof(McaRecord)) {
+ printk("MCE log check: invalid rec size, expected %ld\n", sizeof(McaRecord));
+ goto ee_bad;
+ }
+ if (hdr.rec_tail != ee_tail ||
+ hdr.rec_head != ee_head) {
+ printk("MCE log check: cached h/t mismatch %d/%d\n", ee_head, ee_tail);
+ goto ee_bad;
+ }
+ if (hdr.entries != ee_num) {
+ printk("MCE log check: cached capacity mismatch %d\n", ee_num);
+ goto ee_bad;
+ }
+
+ /*
+ * Header looks OK,
+ * Dump all valid entries in eeprom
+ */
+ for(i = 0; i < hdr.entries; i++) {
+ ee_get(&rec, i);
+
+ /*
+ * Uninitialized parts have all FFs in them,
+ * need to screen those before testing the valid bit
+ */
+ if (rec.id != 0xffffffff && GET_BIT(31, rec.id)) {
+#if EE_VERIFY
+ dmp_hex(&rec, sizeof(rec), "ee_list: Entry[%d]", i);
+#endif
+ pos = hdr.rec_start + i * hdr.rec_size;
+ printk("Log %4d (pos %06x): id %08x, "
+ "ctrl %016llx, stat %016llx, addr %016llx, misc %016llx, time %d\n",
+ i, pos, rec.id, rec.ctl, rec.status,
+ rec.addr, rec.misc, rec.stamp);
+ }
+ }
+ }
+ else {
+ printk("MCE log header: bad signature %02x%02x%02x%02x%02x%02x%02x%02x\n",
+ hdr.signature[0], hdr.signature[1], hdr.signature[2], hdr.signature[3],
+ hdr.signature[4], hdr.signature[5], hdr.signature[6], hdr.signature[7]);
+ }
+
+ee_bad:
+ /*
+ * Release I2C bus exclusive lock
+ */
+ ee_unlock();
+}
+EXPORT_SYMBOL_GPL(ee_list);
+
/*
 * Validation hook: erase the EEPROM log.
 * Deliberately compiled out (the #if 1 guard) so production
 * builds only log the request; flip the guard to re-enable
 * the actual ee_mint() wipe for RAS testing.
 */
void
ee_wipe(void)
{
#if 1
  printk("Wiping EEPROM disabled, call ignored\n");
#else
  ee_mint();
#endif
}
EXPORT_SYMBOL_GPL(ee_wipe);
+#endif /* CONFIG_MK1OM */
+
+
+/*
+**
+** Setup access to the EEPROM on KnC
+** This include initializing the local I2C driver and
+** locating the next write position in the EEPROM.
+** We want to limit the exception time activity to
+** a minimum and thus make preparations up front.
+** This is expected to happen before enabling the
+** MC event intercepts.
+**
+*/
+
/*
 * One-time setup for EEPROM logging and the I2C console UART.
 * On KnC (CONFIG_MK1OM): verifies the I2C unit is idle, reads the
 * EEPROM header (initializing it if virgin), caches head/tail/
 * capacity, and creates /proc/elog (writable only with EE_INJECT
 * or at manufacturing, detected via the SMC). Always: points the
 * I2C controller at the UART and marks ee_rdy.
 * Returns 0. Expected to run before MC event intercepts are armed.
 */
int
ee_init(void)
{
#if 0
  /*
   * Clocking the delay loop.
   * Average results over 3 runs:
   *   uSec      % off
   *      1      12.46
   *      2       6.22
   *      4       4.34
   *      8       3.41
   *     16       2.90
   *     32       2.65
   *     64       2.52
   *    128       2.46
   *    256       2.43
   *    512       2.41
   *   1024       2.41
   *   2048       6.30
   *   4096       2.43
   *   8192       3.28
   *  16384       3.30
   *  32768       3.42
   * , which is fine for the purposes in this driver.
   */
  {
    uint64_t t1, t2;
    uint64_t usec, pwr;

    printk("RAS.test: tsc_khz %d\n", tsc_khz);
    for(pwr = 0; pwr < 16; pwr++) {
      usec = 1UL << pwr;
      t1 = rdtsc();
      myDELAY(usec);
      t2 = rdtsc();
      printk("RAS.test: myDelay(%lld) => %lld clocks\n", usec, t2 - t1);
    }
  }
#endif

#ifdef CONFIG_MK1OM
  if (! mce_disabled) {
    McaHeader hdr;

#ifndef CONFIG_I2C_PXA
    /*
     * Reset I2C controller if PXA driver is not included in the kernel.
     */
    i2c_reset();
#endif

    /*
     * Get I2C bus exclusive access
     */
    ee_lock();

    /*
     * Paranoia!!
     * At this point the I2C controller should be inactive and
     * the I2C bus should be idle. Verify this to be true.
     * Note: This check is only applied on this very first
     *       access to the I2C controller. If it passed the
     *       two criterias we _assume_ we have good hardware.
     * TBD: should we assume that the I2C subsystem can go bad
     *      at runtime and add more checking?
     */
    ee_num = 0;
    if ((reg_read(ISR_OFFSET) & ISR_UB) || (reg_read(IBMR_OFFSET) != 3)) {
      printk("RAS.elog: I2C unit out of control, cannot access EEPROM\n");
    }
    else {
      /*
       * Get EEPROM header and cache log state.
       */
      ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr));
      if (memcmp(hdr.signature, elog_preset.signature, sizeof(elog_preset.signature)) ||
          hdr.header_ver != elog_preset.header_ver ||
          hdr.rec_start != elog_preset.rec_start ||
          hdr.rec_size != elog_preset.rec_size ||
          hdr.hwtype != elog_preset.hwtype) {
        /* Virgin or incompatible device: stamp a fresh header, re-read */
        printk("RAS.elog: Found un-initialized EEPROM, initializing ..\n");
        ee_wr(MR_ELOG_ADDR_LO, 0, (uint8_t *) &elog_preset, sizeof(elog_preset));
        ee_rd(MR_ELOG_ADDR_LO, 0, (uint8_t *) &hdr, sizeof(hdr));
      }

      if (memcmp(hdr.signature, elog_preset.signature, sizeof(elog_preset.signature)) ||
          hdr.header_ver != elog_preset.header_ver ||
          hdr.rec_start != elog_preset.rec_start ||
          hdr.rec_size != elog_preset.rec_size ||
          hdr.hwtype != elog_preset.hwtype) {
        /*
         * Write to EEPROM header failed.
         * Leave a message in the kernel log about it and set capacity to 0.
         */
        printk("RAS.elog: Error: EEPROM initialization failed!\n");
      }
      else {
        ee_num = hdr.entries;
        ee_head = hdr.rec_head;
        ee_tail = hdr.rec_tail;
        printk("RAS.elog: rev %d, size %d, head %d, tail %d\n",
            hdr.header_ver, ee_num, ee_head, ee_tail);
        if (ee_head != ee_tail) {
          /*
           *TBD: should we be aggressive and replay these events to the host
           *     when it opens the MC SCIF channel to force the issue?
           */
          printk("RAS.elog: Warning: MCA log has unprocessed entries\n");
        }
      }
    }
    if (!ee_num)
      printk("RAS.elog: MCA events cannot be logged to EEPROM\n");

    /*
     * Release I2C bus exclusive lock
     */
    ee_unlock();
  }
#endif /* CONFIG_MK1OM */

  /*
   * Reset I2C bus & UART (sort of, internal reset only)
   */
  xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO);
  cons_init();
  ee_rdy = 1;

#if defined(CONFIG_MK1OM) && EE_PROC
  /*
   * Create proc file
   * We allow writes if EE_INJECT is defined or during manufacturing.
   */
  {
    int mode;
#if EE_INJECT
    mode = 0644;
#else
    uint32_t smc_err, smc_val, smc_fwv;

    /*
     * HSD 4846538
     * Needs SMC FW 1.8 or later to be safe to use.
     * Read FW version; if failed then not at manufacturing.
     * If FW version 1.8 or later go read Zombie register.
     * If zombie register responded we're at manufacturing,
     */
    mode = 0444;
    smc_err = gmbus_i2c_read(2, 0x28, 0x11, (uint8_t *) &smc_fwv, sizeof(smc_fwv));
    if (smc_err == sizeof(smc_fwv) && GET_BITS(31, 16, smc_fwv) >= 0x0108) {
      smc_err = gmbus_i2c_read(2, 0x28, 0x1b, (uint8_t *) &smc_val, sizeof(smc_val));
      if (smc_err == sizeof(uint32_t))
        mode = 0644;
    }
    if (mode == 0444)
      proc_elog_operations.write = 0;
#endif
    elog_pe = proc_create("elog", mode, 0, &proc_elog_operations);
  }
#endif

#if 0
  /*
   * Say hello on the console
   */
  ee_printk("RAS: ee_print ready, uart adr %02x\n",
      SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0));
#endif

  if (mce_disabled)
    printk("RAS.elog: disabled\n");
  else
    printk("RAS.elog: init complete\n");
  return 0;
}
+
+
+/*
+ * Cleanup for module unload.
+ * Free any resources held by this driver
+ */
+
+int
+ee_exit(void)
+{
+#if defined(CONFIG_MK1OM) && EE_PROC
+ if (elog_pe) {
+ remove_proc_entry("elog", 0);
+ elog_pe = 0;
+ }
+#endif
+
+
+ /*
+ * Reset I2C bus & UART (sort of, internal reset only)
+ */
+ ee_rdy = 0;
+ xfr_configure(SC16IS_ADDR(SC16IS_ADDR_1, SC16IS_ADDR_0), FREQ_AUTO);
+ cons_exit();
+
+ printk("RAS.elog: exit complete\n");
+ return 0;
+}
+
+#endif /* EMULATION */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS MT module driver
+ *
+ * Code and data structures to handle get/set tasks for KnC.
+ * Parties accessing the data structures are supposed to use the
+ * micras_mt_tsk() routines to ensure integrity and consistency.
+ * Particularly important when handling sysfs nodes and actions
+ * requested from SCIF connections must use that method in order
+ * to guarantee serialized access.
+ *
+ * Even if read-only access to latest valid data is required,
+ * it should go through micras_mt_tsk() using dedicated handlers
+ * in this module.
+ *
+ * Apologies for the messy code, but hardware support to report
+ * board properties at this time (Power-On of A0) is so erratic
+ * that odd ways of obtaining the info had to replace the POR
+ * methods. The SMC support is sporadic, A0 has issues with SVID
+ * and some SBOX registers are invalid because they depend on
+ * TMU telemetry transmissions from the SMC which some reason
+ * has been forgotten/missed/defeatured (does not happen).
+ *
+ * TBD: Once the dust settles there will be code to remove.
+ * But until then, lots of #ifdef's remains.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/io.h>
+#include <linux/utsname.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/jiffies.h>
+#include <linux/kernel_stat.h>
+#include <linux/bitmap.h>
+#include <asm/mic/mic_knc/autobaseaddress.h>
+#include <asm/mic/mic_knc/micsboxdefine.h>
+#include "micras.h"
+
+
+/*
+ * Persistent data accessible through the CP api.
+ * Some functions just read/modify hardware CSRs
+ * and thus need no storage between invocations.
+ */
+
+extern struct mr_rsp_hwinf hwinf;
+extern struct mr_rsp_vers vers;
+extern struct mr_rsp_volt volt;
+extern struct mr_rsp_freq freq;
+extern struct mr_rsp_power power;
+extern struct mr_rsp_plim plim;
+extern struct mr_rsp_gddr gddr;
+extern struct mr_rsp_gvolt gvolt;
+extern struct mr_rsp_gfreq gfreq;
+extern struct mr_rsp_temp temp;
+extern struct mr_rsp_ecc ecc;
+extern struct mr_rsp_trbo trbo;
+extern struct mr_rsp_pmcfg pmcfg;
+
+#if USE_SVID
+static uint8_t vccp_cap, vddq_cap, vddg_cap;
+static uint8_t vccp_imax, vddq_imax, vddg_imax;
+#endif
+
+uint8_t xlat_cpu[NR_CPUS];
+
+#define FIX_DBOX 1
+
+#if FIX_DBOX
+/*
+ * Pre-emptive restoring DBOX-0 register access.
+ * A glitch during clock speed changes (PM or GPU_HOT)
+ * may under some rare circumstances break access to DBOX
+ * registers. It is very rare, requires hours of tailored
+ * simulation to reproduce, never seen in the wild (yet).
+ * The gmbus controller sits in the DBOX and is affected.
+ * Calling this routine prior to every gmbus read/write
+ * reduces risk of hitting this bug to a single SMC register,
+ * which has been deemed acceptable for B-step KnCs.
+ * Only alternative is to perform repeated transaction(s)
+ * until a stable result is obtained, which will be costly
+ * in performance.
+ */
/*
 * Touch two DBOX registers to nudge the DBOX register access
 * path back to a known good state (see comment block above).
 * The values read are deliberately discarded; only the read
 * accesses themselves matter.
 */
static void
mr_smc_deglitch(void)
{
  mr_dbox_rl(0, 0x600);
  mr_dbox_rl(0, 0x2440);
}
+#else
+#define mr_smc_deglitch(); /* As nothing */
+#endif
+
+
+/*
+**
+** Conversion between CP formats (uV, MHz, etc.)
+** and hardware register formats (SMC and VRs mostly).
+**
+*/
+
+
+/*
+ * PLL tables used to map between hw scale register
+ * value and actual frequencies given a fixed base.
+ *
+ * The core frequency (MCLK) formula is
+ * freq = Icc * (Feedback / Feedforward)
+ * where
+ * Icc = Frequency generated from ICC, nominal 200 MHz
+ * FeedBack = ratio bits 8:1 (valid range: 8 .. 16)
+ * FeedForward = ratio bits 10:9 (01 -> 4, 10 -> 2, 11 -> 1)
+ *
+ * The gddr frequency (PGCLK) formula is
+ * freq = (X / 2) * Feedback / Feedforward
+ * where
+ * X = SBPLL (ICC) Table 1, FB range 10..22
+ * X = LCVCO (ICC/2) Table 2, FB range 44..65
+ * X = Bypass (ICC/2) Table 3, FB range 20..44
+ * which is why there's three gddr tables. The divide by 2 of
+ * 'X' is represented as doubling the FF dividers in the tables.
+ *
+ * Overlapping ranges over feedback and feedforward values are
+ * handled by range table(s) below such that lower frequencies
+ * can be selected at a finer granularity. The tables themselves
+ * do not allow overlaps, i.e. two ways to specify the same
+ * PLL output frequency.
+ *
+ * Note that ICC clocks have their own PLL built in which uses
+ * the PCI-E 100 MHz clock, adds SSC and scale it by a pair of
+ * dividers. One divider is (I'm told) fixed at 40, the other
+ * is fused, and none of them can be read from uOS at runtime.
+ * The fused dividers are nominally 20, which is what the
+ * tables below is based on. Some SKUs tweak the core ICC PLL
+ * by fuses, so to counter it that divider is reported in scr #4.
+ * No means to know if gddr ICC PLL gets tweaked too.
+ *
+ *WARNING: there are overlaps on the divider codes for GDDR PLLs,
+ * which theoretically can cause false reporting of GDDR
+ * device speeds (example: FB dividers 20, 21, and 22 are
+ * defined both in gddr_tab1 and gddr_tab3). Currently
+ * there is no way to determine which table is used.
+ */
+
/*
 * PLL divider tables; see the large comment block above for the
 * frequency formulas. Rows are indexed by the (inverted) 2-bit
 * feed-forward field of the ratio register — see ratio2freq()
 * and freq2ratio(), which use the row index directly.
 */
struct pll_tab {
  uint8_t clk_div; /* Feed forward */
  uint8_t min_mul; /* Lower feedback */
  uint8_t max_mul; /* Upper feedback */
  uint16_t min_clk; /* Lower frequency */
  uint16_t max_clk; /* Upper frequency */
  uint8_t step_size; /* Granularity */
} cpu_tab[] = { /* CPU PLL, ICC @ ~200 MHz */
  {1, 8, 16, 1600, 3200, 200},
  {2, 8, 15, 800, 1500, 100},
  {4, 8, 15, 400, 750, 50},
}, gddr_tab1[] = { /* GDDR PLL, ICC @ 200 MHz */
  {2, 10, 22, 1000, 2200, 100},
  {4, 10, 22, 500, 1100, 50},
  {8, 10, 22, 250, 550, 25},
}, gddr_tab2[] = { /* GDDR PLL, LCVCO @ 100 MHz */
  {2, 44, 65, 2200, 3250, 50},
}, gddr_tab3[] = { /* GDDR PLL, ICC bypass @ 100 MHz */
  {2, 20, 44, 1000, 2200, 100},
  {4, 20, 44, 500, 1100, 50},
  {8, 20, 44, 250, 550, 25},
};
+
+#define ICC_NOM 20 /* Nominal ICC feed back divider */
+
+static uint16_t
+ratio2freq(uint16_t ratio, struct pll_tab * tab, int tablen, uint16_t base)
+{
+ uint16_t fwd, bck;
+
+ fwd = GET_BITS(10, 9, ~ratio);
+ bck = GET_BITS(8, 1, ratio);
+
+ if (tab == gddr_tab3 && (bck & 1))
+ return 0;
+
+ if (fwd < tablen && bck >= tab[fwd].min_mul && bck <= tab[fwd].max_mul)
+ return (base * bck) / tab[fwd].clk_div;
+
+ return 0;
+}
+
+static uint16_t
+freq2ratio(uint16_t freq, struct pll_tab * tab, int tablen, uint16_t base)
+{
+ int fwd;
+
+ for(fwd = tablen - 1; fwd >= 0; fwd--) {
+ if (freq >= tab[fwd].min_clk && freq <= tab[fwd].max_clk) {
+ /*
+ * Why bother check for accurate input?
+ * Ignoring it just rounds down to nearest supported!
+ */
+ if (freq % tab[fwd].step_size)
+ break;
+
+ return PUT_BITS(10, 9, ~fwd) |
+ PUT_BITS( 8, 1, (freq * tab[fwd].clk_div) / base);
+ }
+ }
+
+ return 0;
+}
+
+static uint32_t
+icc_fwd(void)
+{
+ uint32_t scr4, div;
+
+ scr4 = mr_sbox_rl(0, SBOX_SCRATCH4);
+ div = GET_BITS(29, 25, scr4);
+
+ return div ? div : ICC_NOM;
+}
+
+static uint32_t
+mr_mt_gf_r2f(uint16_t pll)
+{
+ uint64_t freq;
+
+ /*
+ * As per HSD 4118175, ICC clock at 200 MHz is currently not
+ * used on any SKUs, and is unlikely to be used in the future.
+ * Therefore, the 100 MHz tables are searched first.
+ */
+ freq = ratio2freq(pll, gddr_tab3, ARRAY_SIZE(gddr_tab3), 100);
+ if (! freq)
+ freq = ratio2freq(pll, gddr_tab2, ARRAY_SIZE(gddr_tab2), 100);
+ if (! freq)
+ freq = ratio2freq(pll, gddr_tab1, ARRAY_SIZE(gddr_tab1), 200);
+
+ return 1000 * freq;
+}
+
/*
 * Convert a core PLL ratio register value to a frequency in kHz.
 * The table result (MHz, assuming a nominal 200 MHz ICC clock)
 * is scaled by nominal/actual ICC feed divider (see icc_fwd())
 * and by 1000 to give kHz.
 * Returns 0 if the ratio is not covered by the core PLL table.
 */
static uint32_t
mr_mt_cf_r2f(uint16_t pll)
{
  uint64_t freq;

  freq = ratio2freq(pll, cpu_tab, ARRAY_SIZE(cpu_tab), 200);

  return (1000 * freq * ICC_NOM) / icc_fwd();
}
+
+
+#if USE_SVID
+/*
+ * VRM12 voltage converters
+ * Only bits 7:0 are being used as follows:
+ * Volt = Min + Res * (Bits -1)
+ * Bits = 1 + (Volt - Min) / Res
+ * Bits value of 0 reserved for turning VR off.
+ */
+
+#define VRM12_MAX 1520000 /* 1.52 V */
+#define VRM12_MIN 250000 /* 250 mV */
+#define VRM12_RES 5000 /* 5.0 mV */
+
+static uint32_t
+svid2volt(uint8_t svid)
+{
+ uint32_t bits;
+
+ bits = GET_BITS(7, 0, svid);
+ if (bits)
+ return VRM12_MIN + VRM12_RES * (bits - 1);
+ else
+ return 0;
+}
+
+static uint8_t
+volt2svid(uint32_t uv)
+{
+ uint32_t delta, bits;
+
+ bits = 0;
+ if (uv >= VRM12_MIN && uv <= VRM12_MAX) {
+ delta = uv - VRM12_MIN;
+ /*
+ * Why bother check for accurate input?
+ * Ignoring it just rounds up to nearest!
+ */
+ if (! (delta % VRM12_RES))
+ bits = 1 + delta / VRM12_RES;
+ }
+ return PUT_BITS(7, 0, bits);
+}
+
+
+/*
+ * SVID register scaling:
+ *
+ * Vin = SVID_REG(0x1A) <unknown>
+ * Iin = SVID_REG(0x19) 1:1 A
+ * Pin = SVID_REG(0x1B) 1:1 W
+ * Vout = SVID_REG(0x16) / 128 V
+ * Iout = SVID_REG(0x15) 1:1 A
+ * Pout = SVID_REG(0x18) 1:1 W
+ * Iout = (SVID_REG(0x15) / ADCmax) * SVID_REG(0x21) A
+ * Temp = SVID_REG(0x17) 1:1 C
+ *
+ * Note: SVID_REG(0x06) bit 7 tells Iout formula.
+ * Assuming 8-bit ADC => ADCmax to be 0xff.
+ *
+ * Inputs are SVID register values, outputs are u{V|A|W}.
+ */
+
/*
 * Convert an SVID Vout register reading into micro-volts.
 * Encoding is linear, 1/128 V per LSB (0 .. ~2 volt range).
 */
static uint32_t
vout2volt(uint8_t vout)
{
  uint32_t uv = (uint32_t) vout * 1000000;

  return uv / 128;
}
+
/*
 * Convert an SVID Vin register reading into micro-volts.
 * The true encoding is not documented; the same 1/128 V
 * per LSB scaling as Vout is assumed here.
 */
static uint32_t
vin2volt(uint8_t vin)
{
  uint32_t uv = (uint32_t) vin * 1000000;

  return uv / 128;
}
+
/*
 * Convert a 1:1 scaled SVID register reading (amps or watts)
 * into micro-units (uA or uW).
 */
static uint32_t
one2one(uint8_t in)
{
  return 1000000 * (uint32_t) in;
}
+
+static uint32_t
+iout2amp(uint8_t iout, uint8_t cap, uint8_t imax)
+{
+ if (GET_BITS(7, 7, cap))
+ return (((uint32_t) iout) * ((uint32_t) imax) * 1000000) / 256;
+ else
+ return one2one(iout);
+}
+
+#define iin2amp(iin) one2one(iin)
+#define pin2watt(pin) one2one(pin)
+#define pout2watt(pout) one2one(pout)
+
+
+
+/*
+**
+** Simple SVIDCONTROL interface.
+**
+** 0 Parity bit out
+** 8:1 SVID data out
+** 13:9 SVID command
+** 17:14 SVID address
+** 18 Parity bit in (if any)
+** 26:19 SVID data in (if any)
+** 27 ACK #0
+** 28 ACK #1
+** 29 SVID Error
+** 30 CTL Idle
+** 31 CMD Start
+**
+** See SBOX HAS for more details.
+** One transaction is expected to finish
+** in less than 2 uSec (15.625 MHz clock)
+** and busy waiting here should be OK.
+**
+** Return values:
+** 0 OK
+** 1-7 Controller bits 29:27
+** 8 Parameter error (invalid device or opcode)
+**
+*/
+
+/*
+ * SVID command set
+ * Source: SVID Protocol rev 1.5
+ */
+#define VR12Cmd_Extend 0x00 /* Req */
+#define VR12Cmd_SetVID_Fast 0x01 /* Req */
+#define VR12Cmd_SetVID_Slow 0x02 /* Req */
+#define VR12Cmd_SetVID_Decay 0x03 /* Req */
+#define VR12Cmd_SetPS 0x04 /* Req */
+#define VR12Cmd_SetRegADR 0x05 /* Req */
+#define VR12Cmd_SetRegDAT 0x06 /* Req */
+#define VR12Cmd_GetReg 0x07 /* Req */
+#define VR12Cmd_TestMode 0x08 /* Req */
+
+/*
+ * SVID registers of interest
+ * Source: SVID Protocol rev 1.5
+ *
+ * Notes on the capability register:
+ * bit 0 Iout (0x15)
+ * bit 1 Vout (0x16)
+ * bit 2 Pout (0x18)
+ * bit 3 Iin (0x19)
+ * bit 4 Vin (0x1a)
+ * bit 5 Pin (0x1b)
+ * bit 6 Temp (0x17)
+ * bit 7 Iout format of register 0x15
+ * 0 -> value in Amps
+ * 1 -> value scaled to Icc_Max
+ */
+
+#define VR12Reg_VendorID 0x00 /* Req */
+#define VR12Reg_ProductID 0x01 /* Req */
+#define VR12Reg_ProductRev 0x02 /* Req */
+#define VR12Reg_ProductDate 0x03 /* Opt */
+#define VR12Reg_LotCode 0x04 /* Opt */
+#define VR12Reg_ProtocolID 0x05 /* Req */
+#define VR12Reg_Capability 0x06 /* Req */
+#define VR12Reg_Iout 0x15 /* Req */
+#define VR12Reg_Vout 0x16 /* Opt */
+#define VR12Reg_Temp 0x17 /* Opt */
+#define VR12Reg_Pout 0x18 /* Opt */
+#define VR12Reg_Iin 0x19 /* Opt */
+#define VR12Reg_Vin 0x1a /* Opt */
+#define VR12Reg_Pin 0x1b /* Opt */
+#define VR12Reg_Icc_Max 0x21 /* Req */
+#define VR12Reg_Temp_Max 0x22 /* Req */
+#define VR12Reg_Vout_Max 0x30 /* Req */
+#define VR12Reg_VID_Set 0x31 /* Req */
+
+/*
+ * SVID addresses on KnC
+ */
+#define SVID_VCCP 0x0 /* Core rail */
+#define SVID_VDDQ 0x2 /* Memory rail (1st loop) */
+#define SVID_VDDG 0x3 /* Uncore rail (2nd loop) */
+
+static DEFINE_SPINLOCK(svidcontrol_lock);
+
+static int
+SvidCmd(uint8_t dev, uint8_t op, uint8_t in)
+{
+ uint32_t cmd, ret, err;
+
+ /*
+ * The SVID Controller does not work in A0 (HSD 3498464)
+ * Pretend success, but return 0 always
+ */
+ return 0;
+
+ /*
+ * For now just check that command can be contructed.
+ *
+ *TBD: Add stricter parameter check?
+ */
+ if (dev > GET_BITS(17, 14, ~0) ||
+ op > GET_BITS(13, 9, ~0))
+ return -MR_ERR_SMC;
+
+ /*
+ * Craft 18 bit command with even parity
+ */
+ cmd = PUT_BITS( 8, 1, in) |
+ PUT_BITS(13, 9, op) |
+ PUT_BITS(17, 14, dev);
+ if (bitmap_weight((unsigned long *) &cmd, 18) & 1)
+ cmd |= 1;
+
+ /*
+ * Wait until controller in idle state,
+ * write command + start bit and then
+ * wait for controller to be idle again.
+ */
+ spin_lock(&svidcontrol_lock);
+ for( ;; ) {
+ ret = mr_sbox_rl(0, SBOX_SVIDCONTROL);
+ if (GET_BITS(31, 30, ret) == 0x1)
+ break;
+ }
+ mr_sbox_wl(0, SBOX_SVIDCONTROL, cmd | PUT_BIT(31, 1));
+ for( ;; ) {
+ ret = mr_sbox_rl(0, SBOX_SVIDCONTROL);
+ if (GET_BITS(31, 30, ret) == 0x1)
+ break;
+ }
+ spin_lock(&svidcontrol_lock);
+
+ /*
+ * Report command status
+ * Only if SVID_Error = 0, Ack #1 = 1, and Ack #0 = 0
+ * did we have a successful transfer, and have data
+ * to return (SBOX HAS table 9).
+ */
+ err = GET_BITS(29, 27, ret);
+ return (err == 0x2) ? GET_BITS(26, 19, ret) : -MR_ERR_SMC;
+}
+#endif
+
+
+
+/*
+**
+** SMC API
+**
+** See "Knights Corner System Management Architecture Specification"
+** for details on the SMC internals and supported APIs.
+**
+** This module is based on rev 0.31
+**
+*/
+
+#define MR_SMC_ADDR 0x28 /* SMC DVO-B Slave address */
+
+#define MR_SMC_PCI_VID 0x00 /* PCI Vendor ID, 4 */
+#define MR_SMC_PCI_DID 0x02 /* PCI Device ID, 4 */
+#define MR_SMC_PCI_BCC 0x04 /* PCI Base Class Code, 4 */
+#define MR_SMC_PCI_SCC 0x05 /* PCI Sub Class Code, 4 */
+#define MR_SMC_PCI_PI 0x06 /* PCI Programming Interface, 4 */
+#define MR_SMC_PCI_SMBA 0x07 /* PCI MBus Manageability Address, 4 */
+#define MR_SMC_UUID 0x10 /* Universally Unique Identification, 16 */
+#define MR_SMC_FW_VERSION 0x11 /* SMC Firmware Version, 4 */
+#define MR_SMC_EXE_DOMAIN 0x12 /* SMC Execution Domain, 4 */
+#define MR_SMC_STS_SELFTEST 0x13 /* SMC Self-Test Results, 4 */
+#define MR_SMC_HW_REVISION 0x14 /* SMC Hardware Revision, 4 */
+#define MR_SMC_SERIAL 0x15 /* Card serial number, 12 */
+#define MR_SMC_SMB_RESTRT 0x17 /* Restart SMBus addr negotiation, 4 */
+
+#define MR_SMC_CPU_POST 0x1a /* POST Register, 4 */
+#define MR_SMC_ZOMBIE 0x1b /* Zombie Mode Enable, 4 */
+#define MR_SMC_CPU_ID 0x1c /* CPU Identifier, 4 */
+
+#define MR_SMC_SEL_ENTRY_SEL 0x20 /* SEL Entry Selection Register, 4 */
+#define MR_SMC_SEL_DATA 0x21 /* SEL Data register, <N> */
+#define MR_SMC_SDR_ENTRY_SEL 0x22 /* SDR Entry Selection Register, 4 */
+#define MR_SMC_SDR_DATA 0x23 /* SDR Data register, <N> */
+
+#define MR_SMC_PWR_PCIE 0x28 /* PCIe Power Reading, 4 */
+#define MR_SMC_PWR_2X3 0x29 /* 2x3 Power Reading, 4 */
+#define MR_SMC_PWR_2X4 0x2a /* 2x4 Power Reading, 4 */
+#define MR_SMC_FORCE_TTL 0x2b /* Forced Throttle, 4 */
+#define MR_SMC_PWR_LIM_0 0x2c /* Power Limit 0, 4 */
+#define MR_SMC_TIME_WIN_0 0x2d /* Time Window 0, 4 */
+#define MR_SMC_PWR_LIM0_GRD 0x2e /* Power Limit 0 Guardband, 4 */
+#define MR_SMC_PWR_LIM_1 0x2f /* Power Limit 1, 4 */
+#define MR_SMC_TIME_WIN_1 0x30 /* Time Window 1, 4 */
+#define MR_SMC_INCL_3V3 0x31 /* Include 3.3 V, 4 */
+#define MR_SMC_PWR_LIM_PERS 0x32 /* Power Limit Persistence, 4 */
+#define MR_SMC_CLAMP_MODE 0x33 /* Clamp Mode, 4 */
+#define MR_SMC_ENERGY_STS_0 0x34 /* Energy Status 0, 4 */
+#define MR_SMC_AVG_PWR_0 0x35 /* Average Power 0, 4 */
+#define MR_SMC_AVG_PWR_1 0x36 /* Average Power 1, 4 */
+#define MR_SMC_MIN_PWR 0x37 /* Min Power, 4 */
+#define MR_SMC_PWR_TTL_DUR 0x38 /* Power Throttle Duration, 4 */
+#define MR_SMC_PWR_TTL 0x39 /* Power Throttling, 4 */
+#define MR_SMC_PWR_INST 0x3a /* Instantaneous Power Reading, 4 */
+#define MR_SMC_PWR_IMAX 0x3b /* Maximum Power Reading, 4 */
+#define MR_SMC_VOLT_VCCP 0x3c /* VCCP VR Output Voltage, 4 */
+#define MR_SMC_VOLT_VDDQ 0x3d /* VDDQ VR Output Voltage, 4 */
+#define MR_SMC_VOLT_VDDG 0x3e /* VDDG VR Output Voltage, 4 */
+
+#define MR_SMC_TEMP_CPU 0x40 /* CPU DIE Temperature, 4 */
+#define MR_SMC_TEMP_EXHAUST 0x41 /* Card Exhaust Temperature, 4 */
+#define MR_SMC_TEMP_INLET 0x42 /* Card Inlet Temperature, 4 */
+#define MR_SMC_TEMP_VCCP 0x43 /* VCCP VR Temperature, 4 */
+#define MR_SMC_TEMP_VDDG 0x44 /* VDDG VR Temperature, 4 */
+#define MR_SMC_TEMP_VDDQ 0x45 /* VDDQ VR Temperature, 4 */
+#define MR_SMC_TEMP_GDDR 0x46 /* GDDR Temperature, 4 */
+#define MR_SMC_TEMP_EAST 0x47 /* East Temperature, 4 */
+#define MR_SMC_TEMP_WEST 0x48 /* West Temperature, 4 */
+#define MR_SMC_FAN_TACH 0x49 /* Fan RPM, 4 */
+#define MR_SMC_FAN_PWM 0x4a /* Fan PWM Percent, 4 */
+#define MR_SMC_FAN_PWM_ADD 0x4b /* Fan PWM Adder, 4 */
+#define MR_SMC_TCRITICAL 0x4c /* KNC Tcritical temperature, 4 */
+#define MR_SMC_TCONTROL 0x4d /* KNC Tcontrol temperature, 4 */
+#define MR_SMC_TRM_TTL_DUR 0x4e /* Thermal Throttle Duration, 4 */
+#define MR_SMC_TRM_TTL 0x4f /* Thermal Throttling, 4 */
+#define MR_SMC_TRM_PUSH 0x50 /* Target for die temp push, 4 */
+
+#define MR_SMC_PWR_VCCP 0x58 /* VCCP VR Output Power, 4 */
+#define MR_SMC_PWR_VDDQ 0x59 /* VDDQ VR Output Power, 4 */
+#define MR_SMC_PWR_VDDG 0x5a /* VDDG VR Output Power, 4 */
+
+#define MR_SMC_LED_CODE 0x60 /* LED blink code, 4 */
+
+
+/*
+ * Simple I/O access routines for most SMC registers.
+ * All but UUID & SERIAL are 4 bytes in size.
+ */
+#define SMC_TRACK 0
+
+#if SMC_TRACK
+#define RL printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, reg, *val, rl)
+#define WL printk("%s: %2x <- %08x, rtn %d\n", __FUNCTION__, reg, *val, rl)
+#else
+#define RL /* As nothing */
+#define WL /* As nothing */
+#endif
+
+#ifdef MIC_IS_EMULATION
+/*
+ * Emulation does not handle I2C busses.
+ * Therefore all code that deals with I2C needs to be
+ * replaced with harmless substitutes in emulation.
+ * The following stubs are for emulation only.
+ */
/*
 * Emulation stub: pretend the I2C read succeeded and hand
 * back a zero-filled buffer of the requested length.
 */
int
gmbus_i2c_read(uint8_t d, uint8_t a, uint8_t r, uint8_t * v, uint16_t l)
{
  if (v != NULL && l != 0)
    memset(v, 0, l);

  return l;
}
+
/*
 * Emulation stub: pretend the I2C write succeeded.
 * All arguments are ignored; the byte count is echoed back.
 */
int
gmbus_i2c_write(uint8_t d, uint8_t a, uint8_t r, uint8_t * v, uint16_t l)
{
  return (int) l;
}
+#endif /* EMULATION */
+
/*
 * Map a gmbus I2C transfer error code to a short description.
 * Codes outside -1 .. -4 map to "unknown".
 */
static char *
gm_err(int err)
{
  static char * tab[] = {
    "timeout",		/* -1 */
    "ack timeout",	/* -2 */
    "interrupted",	/* -3 */
    "invalid command",	/* -4 */
  };

  if (err <= -1 && err >= -4)
    return tab[-err - 1];

  return "unknown";
}
+
+
/*
 * Read a 4-byte SMC register over the gmbus I2C interface.
 *
 * reg  SMC register number
 * val  where to store the value read
 *
 * Returns 0 on success; 1 on failure, with *val set to 0.
 * Only for 4-byte registers; the wider UUID/SERIAL registers
 * go through mr_get_smc() instead.
 */
int
mr_smc_rd(uint8_t reg, uint32_t * val)
{
  int rl;

  mr_smc_deglitch();
  rl = gmbus_i2c_read(2, MR_SMC_ADDR, reg, (uint8_t *) val, sizeof(*val));
  RL;
  if (rl == sizeof(uint32_t))
    return 0;

  /*
   * Something failed, do a dummy read to get I2C bus in a known good state.
   *TBD: Do retries, and if so how many?
   */
  printk("smc_rd: error %d (%s), reg %02x\n", rl, gm_err(rl), reg);
  mr_smc_deglitch();
  /* Dummy read of the FW version register; result discarded */
  gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_FW_VERSION, (uint8_t *) &rl, sizeof(rl));
  *val = 0;
  return 1;
}
+
+int
+mr_smc_wr(uint8_t reg, uint32_t * val)
+{
+ int rl;
+
+ WL;
+ mr_smc_deglitch();
+ rl = gmbus_i2c_write(2, MR_SMC_ADDR, reg, (uint8_t *) val, sizeof(*val));
+ if (rl == sizeof(uint32_t))
+ return 0;
+
+ /*
+ * Something failed, do a dummy read to get I2C bus in a known good state.
+ *TBD: Do retries, and if so how many?
+ */
+ printk("smc_wr: error %d (%s), reg %02x\n", rl, gm_err(rl), reg);
+ mr_smc_deglitch();
+ gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_FW_VERSION, (uint8_t *) &rl, sizeof(rl));
+ return 0;
+}
+#undef RL
+#undef WL
+
+
+/*
+ * Bypass for SMC access.
+ * Kind of a backdoor really as it allows for raw access to the SMC which
+ * may be device dependent and vary significantly between SMC firmware
+ * revisions. This is intended for host side tools that (hopefully) know
+ * what they are receiving through this interface. There is a 'set' command
+ * too, which we screen heavily since the SMC controls board cooling and
+ * therefore is critical for the card's safe operation envelope.
+ */
+
/*
 * Handle a 'GET smc' request: raw read of one SMC register.
 *
 * Input (32 bit): bits 7:0 = SMC register, bits 31:8 must be 0.
 * On success the response struct is built in place over the
 * input buffer and sizeof(*r) is returned.
 *
 * Errors:
 *  -MR_ERR_RANGE  upper input bits set
 *  -MR_ERR_PERM   register not readable (or root-only)
 *  -MR_ERR_SMC    I2C read failed (after one retry)
 */
int
mr_get_smc(void * p)
{
  int rtn;
  uint32_t parm;
  struct mr_rsp_smc * r;

  parm = * (uint32_t *) p;
  if (GET_BITS(31, 8, parm))
    return -MR_ERR_RANGE;
  r = (struct mr_rsp_smc *) p;

  r->reg = GET_BITS(7, 0, parm);

  /*
   * These cannot be read by anybody
   */
  if (r->reg > MR_SMC_LED_CODE ||
      r->reg == MR_SMC_ZOMBIE)
    return -MR_ERR_PERM;

  /*
   * These can only be read by root
   */
  if (! micras_priv)
    switch(r->reg) {
      case MR_SMC_SEL_ENTRY_SEL:
      case MR_SMC_SEL_DATA:
      case MR_SMC_SDR_ENTRY_SEL:
      case MR_SMC_SDR_DATA:
        return -MR_ERR_PERM;
    }

  /*
   * Determine how wide the SMC register is
   * (only UUID and SERIAL are wider than 4 bytes)
   */
  switch(r->reg) {
    case MR_SMC_UUID:
      r->width = 16;
      break;
    case MR_SMC_SERIAL:
      r->width = 12;
      break;
    default:
      r->width = 4;
  }

  mr_smc_deglitch();
  rtn = gmbus_i2c_read(2, MR_SMC_ADDR, r->reg, (uint8_t *) &r->rtn, r->width);
#if SMC_TRACK
  printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, r->reg, r->rtn.val, rtn);
#endif
  if (rtn != r->width) {
    /*
     * Failed once, try one more time
     *TBD: insert a known good read before the actual retry?
     */
    mr_smc_deglitch();
    rtn = gmbus_i2c_read(2, MR_SMC_ADDR, r->reg, (uint8_t *) &r->rtn, r->width);
#if SMC_TRACK
    printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, r->reg, r->rtn.val, rtn);
#endif

    /*
     * NOTE(review): once the first read of the serial number has
     * failed, the cached hwinf.serial is returned even if the retry
     * above succeeded - presumably intentional (cached value is
     * trusted over a flaky bus), but confirm.
     */
    if (r->reg == MR_SMC_SERIAL) {
      memcpy((uint8_t *) &r->rtn, hwinf.serial, r->width);
      rtn = r->width;
    }
  }

  if (rtn != r->width)
    return -MR_ERR_SMC;

  return sizeof(*r);
}
+
+
/*
 * Handle a 'SET smc' request: raw write of one SMC register.
 *
 * Input (32 bit): bits 31:24 = SMC register, bits 23:0 = value.
 *
 * Returns 0 on success, or:
 *  -MR_ERR_PERM    register not settable by this caller
 *  -MR_ERR_INVAUX  value wider than the target register
 *  -MR_ERR_SMC     I2C write to the SMC failed
 */
int
mr_set_smc(void * p)
{
  uint8_t reg;
  uint16_t width;
  int rtn;
  uint32_t val, parm;

  parm = * (uint32_t *) p;
  reg = GET_BITS(31, 24, parm);

  /*
   * Screen for registers we allow setting.
   * POST register is accessible to everyone,
   * only root can 'SET' anything beyond that.
   */
  if (micras_priv) {
    switch (reg) {
      case MR_SMC_CPU_POST:
      case MR_SMC_SEL_ENTRY_SEL:
      case MR_SMC_SDR_ENTRY_SEL:
      case MR_SMC_SMB_RESTRT:
      case MR_SMC_FORCE_TTL:
      case MR_SMC_PWR_LIM_0:
      case MR_SMC_TIME_WIN_0:
      case MR_SMC_PWR_LIM_1:
      case MR_SMC_TIME_WIN_1:
      case MR_SMC_INCL_3V3:
      case MR_SMC_PWR_LIM_PERS:
      case MR_SMC_CLAMP_MODE:
      case MR_SMC_FAN_PWM_ADD:
      case MR_SMC_LED_CODE:
        break;
      default:
        return -MR_ERR_PERM;
    }
  }
  else {
    switch (reg) {
      case MR_SMC_CPU_POST:
        break;
      default:
        return -MR_ERR_PERM;
    }
  }

  /*
   * Screen against known SMC register widths.
   * We insist that unused upper bits are zeros
   */
  switch (reg) {
    case MR_SMC_SEL_ENTRY_SEL:
    case MR_SMC_SDR_ENTRY_SEL:
    case MR_SMC_FAN_PWM_ADD:
      val = GET_BITS(7, 0, parm); /* 8-bit registers */
      break;
    case MR_SMC_PWR_LIM_0:
    case MR_SMC_TIME_WIN_0:
    case MR_SMC_PWR_LIM_1:
    case MR_SMC_TIME_WIN_1:
      val = GET_BITS(15, 0, parm); /* 16 bit registers */
      break;
    case MR_SMC_CPU_POST:
      val = GET_BITS(23, 0, parm); /* 24 bit registers */
      break;
    default:
      val = GET_BIT(0, parm); /* Booleans */
  }
  /* Reject if truncation to the register width lost any bits */
  if (val != GET_BITS(23, 0, parm))
    return -MR_ERR_INVAUX;

  /* SMC transfers are always 4 bytes on the wire */
  width = 4;
  mr_smc_deglitch();
  rtn = gmbus_i2c_write(2, MR_SMC_ADDR, reg, (uint8_t *) & val, width);
#if SMC_TRACK
  printk("%s: %2x <- %08x, rtn %d\n", __FUNCTION__, reg, val, rtn);
#endif
  if (rtn != width)
    return -MR_ERR_SMC;

  return 0;
}
+
+
+/*
+ * IPMI interface.
+ * The SMC has a connection to the host's board management software, which
+ * usually resides in a dedicated Board Management Controller, of which the
+ * SMC is supposed to be a registered satellite controller (aka. additional
+ * management controller). As such the SMC can receive controls originating
+ * from any valid IPMI session on things like power limits, but it can also
+ * add events to the non-volatile IPMI System Events Log for things like
+ * reporting catastrophic failures that otherwise might be lost because the
+ * main processors might be disabled (section 1.7.6 in IPMI spec 2.0 E5).
+ * In RAS context we'd want to let the SM know if fatal MC events occur
+ * and possibly also if the uOS crashes, such that remote management can
+ * be alerted via standard IPMI mechanisms.
+ *
+ * Input to this routine is an MceInfo record and an 'in-exception context'
+ * flag. It is still TBD what exactly to tell the SMC, but it is expected
+ * that all relevant info is in the MceInfo record.
+ */
+
/*
 * Notify system management of a machine check event.
 *
 * mc   machine check event descriptor
 * ctx  non-zero when called from exception context
 *
 * Intentionally empty: what to tell the SMC is still TBD
 * (see the comment block above).
 */
void
micras_mc_ipmi(struct mce_info * mc, int ctx)
{
}
+
+
+#if !(USE_SVID || USE_SMC)
+/*
+ * Board voltage sense converter
+ * Two 10 bit read-outs from SBOX register 0x1038.
+ * The format is very poorly documented, so no
+ * warranty on this conversion. Assumption is
+ * the reading is a binary fixed point number.
+ * bit 15 Valid reading if set
+ * bit 9:8 2 bit integer part
+ * bit 7:0 8 bit fraction part
+ * Return value is 0 (invalid) or voltage i uV.
+ */
+
+uint32_t
+bvs2volt(uint16_t sense)
+{
+ uint32_t res, f, msk;
+
+ if (! GET_BIT(15, sense))
+ return 0;
+
+ /*
+ * First get integer contribution
+ * Then accumulate fraction contributions.
+ * Divide and add fraction if corresponding bit set.
+ */
+ res = 1000000 * GET_BITS(9, 8, sense);
+ for(msk = (1 << 7), f = 1000000/2; msk && f; msk >>= 1, f >>= 1)
+ if (sense & msk)
+ res += f;
+
+ return res;
+}
+#endif
+
+
+
+/*
+**
+** Initializations
+**
+** This has two intended purposes:
+** - Do a on-time effort to collect info on properties that
+** are not going to change after the initial setup by
+** either bootstrap or kernel initialization.
+** - Collect initial values on things we can modify.
+** Intent is that unloading the ras module should reset
+** all state to that of the time the module was loaded.
+**
+*/
+
+
+/*
+ *TBD: substitute with official defines when available.
+ */
+#define KNC_FLASH_TAB 0x0FFF76000 /* Yes, it's below 4GB */
+#define KNC_FLASH_FILT 0x400 /* Correctable MC event filter */
+#define KNC_FLASH_BASE 0x0FFFA8000 /* Yes, it's below 4GB */
+#define KNC_FLASH_SIZE 0x2000 /* 8 KB according to Scott */
+#define KNC_FLASH_BOOT1 0x1274 /* Fboot1 version string */
+#define KNC_FLASH_BOOTB 0x02b8 /* Fboot1 backup version string */
+#define KNC_MP_PHYS 0x9e000 /* Location of MP table */
+#define KNC_MPF_SIG 0xa0afb2a0 /* String "_PM_" inverted */
+#define KNC_MPC_SIG 0x504d4350 /* String "PCMP" */
+
/*
 * Build the CPU -> physical thread translation table (xlat_cpu)
 * by walking the MP (MultiProcessor) configuration table left
 * at the fixed physical address KNC_MP_PHYS.
 * Gives up (leaving xlat_cpu untouched) if either the MP
 * floating pointer or the MP config table signature mismatches.
 */
static void
get_cpu_table(void)
{
  struct mpf_intel * mpf;
  struct mpc_table * mpc;
  struct mpc_cpu * mpp;
  uint8_t * ptr, * ep;

  mpf = phys_to_virt((phys_addr_t) KNC_MP_PHYS);
  if (mpf) {
    if (*((uint32_t *) mpf->signature) != KNC_MPF_SIG) {
      printk("MP FP signature not found, %02x %02x %02x %02x\n",
        mpf->signature[0], mpf->signature[1],
        mpf->signature[2], mpf->signature[3]);
      return;
    }
    mpc = phys_to_virt((phys_addr_t) mpf->physptr);
    if (mpc) {
      if (*((uint32_t *) mpc->signature) != KNC_MPC_SIG) {
        printk("MP header signature not found, %02x %02x %02x %02x\n",
          mpc->signature[0], mpc->signature[1],
          mpc->signature[2], mpc->signature[3]);
        return;
      }
      /*
       * Walk the variable-length entries after the header.
       * CPU entries are 20 bytes, all other known types 8 bytes
       * (Intel MP spec); an unknown type ends the walk.
       */
      ptr = (uint8_t *)(mpc + 1);
      ep = ptr + mpc->length;
      while(ptr < ep) {
        switch(*ptr) {
          case 0x00: /* CPU */
            mpp = (struct mpc_cpu *) ptr;
            /*
             * Only enabled CPUs (cpuflag bit 0) are entered.
             * NOTE(review): reserved[1] appears to carry the
             * physical thread ID on KnC - confirm against the
             * bootstrap that builds this table.
             */
            if (GET_BIT(0, mpp->cpuflag) && mpp->apicid < nr_cpu_ids)
              xlat_cpu[mpp->apicid] = GET_BITS(7, 0, mpp->reserved[1]);
            ptr += 20;
            break;
          case 0x01: /* BUS */
            ptr += 8;
            break;
          case 0x02: /* I/O-APIC */
            ptr += 8;
            break;
          case 0x03: /* INT source */
            ptr += 8;
            break;
          case 0x04: /* LINT source */
            ptr += 8;
            break;
          default: /* Table out of spec */
            ptr = ep;
        }
      }
    }
#if 0
    {
      uint32_t eax, ebx, ecx, edx;
      uint32_t hwt, i;

      cpuid(1, &eax, &ebx, &ecx, &edx);
      hwt = GET_BITS(23, 16, ebx);
      if (hwt > nr_cpu_ids)
        hwt = nr_cpu_ids;
      printk("RAS.card: CPU thread table:\n");
      for(i=0; i < hwt; i++)
        printk(" cpu %d -> thr %d\n", i, xlat_cpu[i]);
    }
#endif
  }
}
+
+
+static void __init
+mr_mk_cf_lst(void)
+{
+ int i, n;
+ uint16_t f;
+
+ /*
+ * If PM module interface is in place, then the
+ * core voltage list may already be populated.
+ */
+ if (freq.supt[0] && freq.slen)
+ return;
+
+ n = 0;
+ for(i = ARRAY_SIZE(cpu_tab) -1; i >= 0; i--) {
+ for(f = cpu_tab[i].min_clk;
+ f <= cpu_tab[i].max_clk;
+ f += cpu_tab[i].step_size) {
+ freq.supt[n] = 1000 * f;
+ freq.slen = ++n;
+ if (n >= MR_PTAB_LEN)
+ return;
+ }
+ }
+}
+
+static void __init
+mr_mk_gf_lst(void)
+{
+ int i, n;
+ uint16_t f;
+
+ n = 0;
+ for(i = ARRAY_SIZE(gddr_tab1) -1; i >= 0; i--) {
+ for(f = gddr_tab1[i].min_clk;
+ f <= gddr_tab1[i].max_clk;
+ f += gddr_tab1[i].step_size) {
+ gfreq.supt[n] = 1000 * f;
+ gfreq.slen = ++n;
+ if (n == MR_PTAB_LEN)
+ return;
+ }
+ }
+ for(i = ARRAY_SIZE(gddr_tab2) -1; i >= 0; i--) {
+ for(f = gddr_tab2[i].min_clk;
+ f <= gddr_tab2[i].max_clk;
+ f += gddr_tab2[i].step_size) {
+ gfreq.supt[n] = 1000 * f;
+ gfreq.slen = ++n;
+ if (n == MR_PTAB_LEN)
+ return;
+ }
+ }
+}
+
/*
 * We can only list 64 values in this list, but on
 * a VRM12 device there are 256 values to choose from.
 * For now we'll list values from 0.6 to 1.3 volt
 * in 10 mV increments (71 values).
 */
+
+#define VRM_MIN 600000
+#define VRM_MAX 1300000
+#define VRM_RES 10000
+
+static void __init
+mr_mk_cv_lst(void)
+{
+ int n;
+ uint32_t cv;
+
+ /*
+ * If PM module interface is in place, then the
+ * core voltage list may already be populated.
+ */
+ if (volt.supt[0] && volt.slen)
+ return;
+
+ n = 0;
+ for(cv = VRM_MIN; cv <= VRM_MAX; cv += VRM_RES) {
+ volt.supt[n] = cv;
+ volt.slen = ++n;
+ if (n >= MR_PTAB_LEN)
+ return;
+ }
+}
+
+
+void __init
+mr_mt_card_init(void)
+{
+ uint32_t scr7, scr9, cf;
+ uint32_t smc, ci;
+ int rtn;
+#ifndef MIC_IS_EMULATION
+ uint8_t * parm;
+#endif
+#if ! USE_SMC
+ uint32_t gv;
+#endif
+#if USE_SVID
+ int svid;
+ uint8_t vr;
+#else
+#if ! USE_SMC
+ uint32_t cv;
+#endif
+#endif
+#if USE_PM
+ int (* fnc)(void);
+#endif
+
+ /*
+ * Make CPU->phys ID translation table
+ */
+ get_cpu_table();
+
+ /*
+ * Build numbers for fboot0 and fboot 1 repectively
+ */
+ scr7 = mr_sbox_rl(0, SBOX_SCRATCH7);
+
+ /*
+ * VERS:
+ * Map flash and look for version strings.
+ */
+#ifdef MIC_IS_EMULATION
+ vers.fboot1[0] = scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2,
+ "No emulation flash version string (build %d)",
+ GET_BITS(31, 16, scr7));
+#else
+ parm = ioremap(KNC_FLASH_BASE, KNC_FLASH_SIZE);
+ if (!parm) {
+ printk("mr_mt_card_init: ioremap failure: parm %x\n", KNC_FLASH_BASE);
+ goto fail_iomap;
+ }
+
+ /*
+ * The fboot0 version (hardwired in the chip) is placed in flash
+ * by bootstrap at a fixed location, and is less than 16 byte long.
+ */
+ if (strnlen(parm + KNC_FLASH_BOOT1, 16) < 16)
+ vers.fboot1[0] = scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2,
+ "fboot1 version: %s (build %d)",
+ parm + KNC_FLASH_BOOT1, GET_BITS(31, 16, scr7));
+ else
+ vers.fboot1[0] =scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2,
+ "No valid version string found");
+ iounmap(parm);
+
+ /*
+ * While at it, check if there is a MC filter list in flash
+ */
+ parm = ioremap(KNC_FLASH_TAB, KNC_FLASH_SIZE);
+ if (!parm) {
+ printk("mr_mt_card_init: ioremap failure: parm %x\n", KNC_FLASH_TAB);
+ goto fail_iomap;
+ }
+ mcc_flt_parm(parm + KNC_FLASH_FILT);
+ iounmap(parm);
+
+fail_iomap:
+#endif
+
+ /*
+ * Retrieve ID details from the SMC
+ * UUID, 16 byte
+ * serial, 12 byte
+ * FW version,
+ * 15:0 Build number
+ * 23:16 Minor version
+ * 31:24 Major version
+ * Note: Ancient systems, like Berta, runs on cards with an older
+ * version on the SMC firmware that does not support serial.
+ */
+ mr_smc_deglitch();
+ rtn = gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_UUID, hwinf.guid, 16);
+#if SMC_TRACK
+ printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, MR_SMC_UUID, *(uint32_t *) hwinf.guid, rtn);
+#endif
+ if (rtn != 16)
+ memset(hwinf.guid, '\0', 16);
+ mr_smc_deglitch();
+ rtn = gmbus_i2c_read(2, MR_SMC_ADDR, MR_SMC_SERIAL, hwinf.serial, 12);
+#if SMC_TRACK
+ printk("%s: %2x -> %08x, rtn %d\n", __FUNCTION__, MR_SMC_SERIAL, *(uint32_t *) hwinf.serial, rtn);
+#endif
+ if (rtn != 12)
+ memcpy(hwinf.serial, "Update_SMC!!", sizeof(hwinf.serial));
+ if (! mr_smc_rd(MR_SMC_FW_VERSION, &smc))
+ vers.fsc[0] = scnprintf(vers.fsc + 1, MR_VERS_LEN -2,
+ "SMC firmware rev. %d.%d (build %d)",
+ GET_BITS(31, 24, smc),
+ GET_BITS(23, 16, smc),
+ GET_BITS(15, 0, smc));
+
+ /*
+ * HWINF:
+ * Get processor details from SBOX componentID.
+ * 19:16 Model ID => aka revision
+ * 15:12 Stepping ID => stepping
+ * 11:8 Substepping ID => substep
+ *
+ * Get Card Revision details from the SMC.
+ * 17:16 board (0=MPI, CRB, SFF, Product)
+ * 10:8 fab version (0='A' .. 7='H')
+ * 2:0 PBA SKU # (need name table here?)
+ */
+ ci = mr_sbox_rl(0, SBOX_COMPONENT_ID);
+ hwinf.rev = GET_BITS(19, 16, ci);
+ hwinf.step = GET_BITS(15, 12, ci);
+ hwinf.substep = GET_BITS(11, 8, ci);
+ if (! mr_smc_rd(MR_SMC_HW_REVISION, &smc)) {
+ hwinf.board = GET_BITS(17, 16, smc);
+ hwinf.fab = GET_BITS(10, 8, smc);
+ hwinf.sku = GET_BITS( 2, 0, smc);
+ }
+
+ /*
+ * VOLT:
+ * By definition, reference voltage is 1st value seen.
+ * Order of preference is SVID, then SMC and lastly SBOX.
+ * SMC register bits 15:0 is voltage in mV.
+ * SBOX_COREVOLT should be in SVID voltage format.
+ */
+#if USE_SVID
+ svid = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_VID_Set);
+ if (svid >= 0)
+ volt.set = svid2volt(svid);
+#else
+#if USE_SMC
+ if (!mr_smc_rd(MR_SMC_VOLT_VCCP, &smc) && GET_BITS(31, 30, smc) != 0x3)
+ volt.set = GET_BITS(15, 0, smc) * 1000;
+#else
+ cv = mr_sbox_rl(0, SBOX_COREVOLT);
+ volt.set = svid2volt(GET_BITS(7, 0, cv));
+#endif
+#endif
+ mr_mk_cv_lst();
+
+ /*
+ * FREQ
+ * By definition, reference frequency is 1st value seen.
+ */
+ cf = mr_sbox_rl(0, SBOX_COREFREQ);
+ freq.def = mr_mt_cf_r2f(GET_BITS(11, 0, cf));
+ mr_mk_cf_lst();
+
+ /*
+ * GDDR:
+ * See layout of scratch #9 in 'common'.
+ * 26:16 Clock ratio encoding
+ * 27 ClamShell
+ */
+ scr9 = mr_sbox_rl(0, SBOX_SCRATCH9);
+ gddr.speed = 2 * mr_mt_gf_r2f(GET_BITS(26, 16, scr9));
+
+ /*
+ * GVOLT:
+ * Report all values the hardware can set, kind
+ * of silly as these cannot be changed from uOS.
+ * Order of preference is SVID, then SMC and lastly SBOX.
+ * SMC register bits 15:0 is voltage in mV.
+ *
+ *TBD: Seriously suspect SBOX register to be wrong.
+ */
+#if USE_SVID
+ svid = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_VID_Set);
+ if (svid >= 0)
+ gvolt.set = svid2volt(svid);
+#else
+#if USE_SMC
+ if (!mr_smc_rd(MR_SMC_VOLT_VDDQ, &smc) && GET_BITS(31, 30, smc) != 0x3)
+ gvolt.set = GET_BITS(15, 0, smc) * 1000;
+#else
+ gv = mr_sbox_rl(0, SBOX_MEMVOLT);
+ gvolt.set = svid2volt(GET_BITS(7, 0, gv));
+#endif
+#endif
+
+ /*
+ * GFREQ:
+ * Report all values the hardware can set, kind
+ * of silly as these cannot be changed from uOS.
+ */
+ gfreq.def = mr_mt_gf_r2f(GET_BITS(26, 16, scr9));
+ mr_mk_gf_lst();
+
+ /*
+ * PWR:
+ * If we are going to use SVID registers we'd need
+ * to know the VRs capabilities and ICC_MAX setting.
+ */
+#if USE_SVID
+ vr = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Capability);
+ if (vr >= 0)
+ vccp_cap = vr;
+ vr = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Capability);
+ if (vr >= 0)
+ vddq_cap = vr;
+ vr = SvidCmd(SVID_VDDG, VR12Cmd_GetReg, VR12Reg_Capability);
+ if (vr >= 0)
+ vddg_cap = vr;
+ vr = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Icc_Max);
+ if (vr >= 0)
+ vccp_imax = vr;
+ vr = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Icc_Max);
+ if (vr >= 0)
+ vddq_imax = vr;
+ vr = SvidCmd(SVID_VDDG, VR12Cmd_GetReg, VR12Reg_Icc_Max);
+ if (vr >= 0)
+ vddg_imax = vr;
+#endif
+
+ /*
+ * ECC:
+ *
+ *TBD: Where to find ECC setting?
+ * There are several GBOX registers that has something
+ * named ECC in them. Scott to tell once PO is done.
+ */
+ ecc.enable = GET_BIT(29, scr9);
+
+ /*
+ * TRBO
+ * The PM module have the inital turbo mode setting.
+ * Get it now, so we don't need to call PM to report it.
+ */
+#if USE_PM
+ fnc = pm_cb.micpm_get_turbo;
+ if (fnc)
+ trbo.set = fnc();
+#endif
+
+ /*
+ *TBD: Save registers this module may change
+ */
+}
+
/*
 * Module-unload counterpart of mr_mt_card_init().
 * Currently a no-op; intended to undo any register changes
 * made by this module once mr_mt_card_init() starts saving them.
 */
void __exit
mr_mt_card_exit(void)
{
	/*
	 *TBD: Restore registers this module may change
	 */
}
+
+
+
+/*
+**
+** Card specific 'Get' functions
+**
+*/
+
/*
 * MT 'get' handler: core (VCCP) voltage.
 * Refreshes the cached 'volt' record from the best available
 * source (SVID > SMC > SBOX, chosen at compile time), copies it
 * into the caller's response buffer @p and returns its size.
 * Negative return = SVID read error passed through.
 */
int
mr_get_volt(void * p)
{
	struct mr_rsp_volt * r;
#if USE_PM
	void (* fnc)(void);
#endif

	/*
	 * Preference is VR out.
	 * Not sure if board sensors work in KnC
	 */
#if USE_SVID
	{
		int vout;

		vout = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_VID_Set);
		if (vout < 0)
			return vout;
		volt.set = svid2volt(vout);

		vout = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Vout);
		if (vout < 0)
			return vout;
		volt.cur = vout2volt(vout);
	}
#else
#if USE_SMC
	{
		uint32_t smc;

		/* Default to "data unavailable" (c_val 3) on SMC failure */
		volt.cur = 0;
		volt.c_val = 3;
		if (! mr_smc_rd(MR_SMC_VOLT_VCCP, &smc)) {
			volt.c_val = GET_BITS(31, 30, smc);
			if (volt.c_val != 0x3)
				volt.cur = GET_BITS(15, 0, smc) * 1000;
		}

		/*
		 *TBD: override 'set' value ?
		 */
	}
#else
	{
		uint32_t fsc, cv;

		cv = mr_sbox_rl(0, SBOX_COREVOLT);
		volt.set = svid2volt(GET_BITS(7, 0, cv));

		fsc = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE);
		volt.cur = bvs2volt(GET_BITS(15, 0, fsc));
	}
#endif
#endif

#if USE_PM
	/*
	 * Ask PM for table refresh
	 */
	fnc = pm_cb.micpm_vf_refresh;
	if (fnc)
		fnc();
#endif

	r = (struct mr_rsp_volt *) p;
	*r = volt;
	return sizeof(*r);
}
+
+
/*
 * MT 'get' handler: core frequency.
 * Reads current and base core ratios from SBOX, refreshes the
 * cached 'freq' record, copies it into the caller's buffer @p
 * and returns its size.
 */
int
mr_get_freq(void * p)
{
	struct mr_rsp_freq * r;
	uint32_t cf, cr;
#if USE_PM
	void (* fnc)(void);
#endif

	/*
	 * Current Ratio:
	 *  11:0	Current core ratio
	 *  15		Enable 600 MHz
	 *  27:16	Goal ratio
	 *  31		OC disable
	 * Goal ratio is a product of base ratio and fuse overrides
	 * Current ratio is a product of goal, fuse limits and thermal throttle
	 *
	 * Core Frequency:
	 *  11:0	Base ratio
	 *  15		Fuse override
	 *  31		Select ratio
	 * Base ratio accepted only if (bit 15 | bit 31 | OC disable) == 010
	 *
	 *TBD: How to detect clock bypasses?
	 *     ICC bypass cuts the core and reference base in half.
	 */
	cr = mr_sbox_rl(0, SBOX_CURRENTRATIO);
	cf = mr_sbox_rl(0, SBOX_COREFREQ);
	freq.cur = mr_mt_cf_r2f(GET_BITS(11, 0, cr));
	freq.def = mr_mt_cf_r2f(GET_BITS(11, 0, cf));
	/* Warn (but still report) if throttling/fusing altered the ratio */
	if (GET_BITS(11, 0, cf) != GET_BITS(11, 0, cr))
		printk("RAS.get_freq: core not running at expected frequency\n");

#if USE_PM
	/*
	 * Ask PM for table refresh
	 */
	fnc = pm_cb.micpm_vf_refresh;
	if (fnc)
		fnc();
#endif

	r = (struct mr_rsp_freq *) p;
	*r = freq;
	return sizeof(*r);
}
+
+
+#if USE_SVID
+int
+mr_get_svid(uint8_t vr, uint8_t cap, uint8_t imax, struct mr_rsp_vrr * vrr)
+{
+ int v, a, p;
+
+ p = SvidCmd(vr, VR12Cmd_GetReg, VR12Reg_Pout);
+ a = SvidCmd(vr, VR12Cmd_GetReg, VR12Reg_Iout);
+ v = SvidCmd(vr, VR12Cmd_GetReg, VR12Reg_Vout);
+
+ if (p < 0 || a < 0 || v < 0)
+ return -MR_ERR_SMC;
+
+ vrr->pwr = pout2watt(p);
+ vrr->cur = iout2amp(a, cap, imax);
+ vrr->volt = vout2volt(v);
+
+ return 0;
+}
+#endif
+
+#define KNC_DFF_BOARD 2 /* DFF/SFF board */
+
+int
+mr_get_power(void * p)
+{
+ struct mr_rsp_power * r;
+#if USE_SMC
+ static struct mr_rsp_vrr vnil = { 0, 0, 0, 3, 3, 3 };
+ static struct mr_rsp_pws pnil = { 0, 3 };
+ uint32_t vccp, vddg, vddq;
+ uint32_t prd0, prd1, pcie, p2x3, p2x4;
+#endif
+
+#if USE_SVID
+ /*
+ * Get VR status over SVID.
+ */
+ if (mr_get_svid(SVID_VCCP, vccp_cap, vccp_imax, &power.vccp) < 0 ||
+ mr_get_svid(SVID_VDDQ, vddq_cap, vddq_imax, &power.vddq) < 0 ||
+ mr_get_svid(SVID_VDDG, vddg_cap, vddg_imax, &power.vddq) < 0)
+ return -MR_ERR_SMC;
+#else
+#if USE_SMC
+ /*
+ * Get VR status from SMC.
+ * Only voltages are available currently.
+ * Still need to screen for good data.
+ * Top 2 bits decode as
+ * 00 Data OK
+ * 01 Upper threshold reached
+ * 10 Lower threshold reached
+ * 11 Data unavailable
+ * Assume data is valid even if a threshold reached
+ */
+ power.vccp = power.vddg = power.vddq = vnil;
+ if (! mr_smc_rd(MR_SMC_VOLT_VCCP, &vccp)) {
+ power.vccp.v_val = GET_BITS(31, 30, vccp);
+ if (power.vccp.v_val != 0x3)
+ power.vccp.volt = 1000 * GET_BITS(15, 0, vccp);
+ }
+ if (! mr_smc_rd(MR_SMC_VOLT_VDDG, &vddg)) {
+ power.vddg.v_val = GET_BITS(31, 30, vddg);
+ if (power.vddg.v_val != 0x3)
+ power.vddg.volt = 1000 * GET_BITS(15, 0, vddg);
+ }
+ if (! mr_smc_rd(MR_SMC_VOLT_VDDQ, &vddq)) {
+ power.vddq.v_val = GET_BITS(31, 30, vddq);
+ if (power.vddq.v_val != 0x3)
+ power.vddq.volt = 1000 * GET_BITS(15, 0, vddq);
+ }
+ if (! mr_smc_rd(MR_SMC_PWR_VCCP, &vccp)) {
+ power.vccp.p_val = GET_BITS(31, 30, vccp);
+ if (power.vccp.p_val != 0x3)
+ power.vccp.pwr = 1000000 * GET_BITS(15, 0, vccp);
+ }
+ if (! mr_smc_rd(MR_SMC_PWR_VDDG, &vddg)) {
+ power.vddg.p_val = GET_BITS(31, 30, vddg);
+ if (power.vddg.p_val != 0x3)
+ power.vddg.pwr = 1000000 * GET_BITS(15, 0, vddg);
+ }
+ if (! mr_smc_rd(MR_SMC_PWR_VDDQ, &vddq)) {
+ power.vddq.p_val = GET_BITS(31, 30, vddq);
+ if (power.vddq.p_val != 0x3)
+ power.vddq.pwr = 1000000 * GET_BITS(15, 0, vddq);
+ }
+#endif
+#endif
+
+#if USE_SMC
+ /*
+ * Get reads on VRs and power sensors from SMC.
+ * This is a mess:
+ * - total power may or may not include 3.3 V rail.
+ * If it is then it's not measured, just "guessed".
+ * - there are two averaging windows for total power,
+ * though it is not clear who controls these windows.
+ * For now we assume window 0 is shorter than window 1
+ * and thus power 0 is 'current' reading and power 1
+ * is the '20 sec' reading.
+ * TBD: Who controls the time windows and is is true
+ * that Window 0 is shorter than Window 1?
+ * - No specifics on how power sensors are averaged,
+ * i.e. is Window 0/1 used or is is a third window.
+ * Need to know, otherwise Ptot may not be sum(sources).
+ * - There still is no 'max' value from SMC
+ *
+ * Still need to screen for good data.
+ * Top 2 bits decode as
+ * 00 Data OK
+ * 01 Upper threshold reached
+ * 10 Lower threshold reached
+ * 11 Data unavailable
+ * Assume data is valid even if a threshold reached
+ */
+ power.tot0 = power.tot1 =
+ power.inst = power.imax =
+ power.pcie = power.c2x3 = power.c2x4 = pnil;
+
+ if (! mr_smc_rd(MR_SMC_AVG_PWR_0, &prd0)) {
+ power.tot0.p_val = GET_BITS(31, 30, prd0);
+ if (power.tot0.p_val != 0x3)
+ power.tot0.prr = 1000000 * GET_BITS(29, 0, prd0);
+ }
+ if (! mr_smc_rd(MR_SMC_AVG_PWR_1, &prd1)) {
+ power.tot1.p_val = GET_BITS(31, 30, prd1);
+ if (power.tot1.p_val != 0x3)
+ power.tot1.prr = 1000000 * GET_BITS(29, 0, prd1);
+ }
+ power.inst = power.imax = pnil;
+ if (! mr_smc_rd(MR_SMC_PWR_INST, &prd0)) {
+ power.inst.p_val = GET_BITS(31, 30, prd0);
+ if (power.inst.p_val != 0x3)
+ power.inst.prr = 1000000 * GET_BITS(29, 0, prd0);
+ }
+ if (! mr_smc_rd(MR_SMC_PWR_IMAX, &prd1)) {
+ power.imax.p_val = GET_BITS(31, 30, prd1);
+ if (power.imax.p_val != 0x3)
+ power.imax.prr = 1000000 * GET_BITS(29, 0, prd1);
+ }
+ if (! mr_smc_rd(MR_SMC_PWR_PCIE, &pcie)) {
+ power.pcie.p_val = GET_BITS(31, 30, pcie);
+ if (power.pcie.p_val != 0x3)
+ power.pcie.prr = 1000000 * GET_BITS(15, 0, pcie);
+ }
+ if (hwinf.board != KNC_DFF_BOARD) {
+ if (! mr_smc_rd(MR_SMC_PWR_2X3, &p2x3)) {
+ power.c2x3.p_val = GET_BITS(31, 30, p2x3);
+ if (power.c2x3.p_val != 0x3)
+ power.c2x3.prr = 1000000 * GET_BITS(15, 0, p2x3);
+ }
+ if (! mr_smc_rd(MR_SMC_PWR_2X4, &p2x4)) {
+ power.c2x4.p_val = GET_BITS(31, 30, p2x4);
+ if (power.c2x4.p_val != 0x3)
+ power.c2x4.prr = 1000000 * GET_BITS(15, 0, p2x4);
+ }
+ }
+#endif
+
+ r = (struct mr_rsp_power *) p;
+ *r = power;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_plim(void * p)
+{
+ uint32_t pl0, pl1, grd;
+ struct mr_rsp_plim * r;
+
+ /*
+ * Get values from PM
+ */
+ if (! mr_smc_rd(MR_SMC_PWR_LIM_0, &pl0))
+ plim.hmrk = GET_BITS(15, 0, pl0);
+
+ if (! mr_smc_rd(MR_SMC_PWR_LIM_1, &pl1))
+ plim.lmrk = GET_BITS(15, 0, pl1);
+
+ if (! mr_smc_rd(MR_SMC_PWR_LIM0_GRD, &grd))
+ plim.phys = plim.hmrk + GET_BITS(15, 0, grd);
+
+ r = (struct mr_rsp_plim *) p;
+ *r = plim;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_gfreq(void * p)
+{
+ struct mr_rsp_gfreq * r;
+ uint32_t gbr;
+
+ /*
+ * SBOX register MEMFREQ bits 7:0 now holds 10 x rate in GTps.
+ */
+ gbr = mr_sbox_rl(0, SBOX_MEMORYFREQ);
+ gfreq.cur = GET_BITS(7, 0, gbr) * 100000 / 2;
+
+ r = (struct mr_rsp_gfreq *) p;
+ *r = gfreq;
+ return sizeof(*r);
+}
+
+
/*
 * MT 'get' handler: GDDR (VDDQ) voltage.
 * Refreshes the cached 'gvolt' record from the best available
 * source (SVID > SMC > SBOX, chosen at compile time) and copies
 * it into the caller's buffer @p.
 */
int
mr_get_gvolt(void * p)
{
	struct mr_rsp_gvolt * r;

	/*
	 * Preference is VR out.
	 * Not sure if board sensors work in KnC
	 */
#if USE_SVID
	{
		int vout;

		vout = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_VID_Set);
		if (vout < 0)
			return vout;
		gvolt.set = svid2volt(vout);

		vout = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Vout);
		if (vout < 0)
			return vout;
		gvolt.cur = vout2volt(vout);
	}
#else
#if USE_SMC
	{
		uint32_t smc;

		/* Default to "data unavailable" (c_val 3) on SMC failure */
		gvolt.cur = 0;
		gvolt.c_val = 3;
		if (! mr_smc_rd(MR_SMC_VOLT_VDDQ, &smc)) {
			gvolt.c_val = GET_BITS(31, 30, smc);
			if (gvolt.c_val != 0x3)
				gvolt.cur = GET_BITS(15, 0, smc) * 1000;
		}
		/* No SMC 'set' source; fall back to the measured value */
		if (!gvolt.set)
			gvolt.set = gvolt.cur;
	}
#else
	{
		uint32_t bvs;

		bvs = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE);
		gvolt.cur = bvs2volt(GET_BITS(31, 16, bvs));
	}
#endif
#endif

	r = (struct mr_rsp_gvolt *) p;
	*r = gvolt;
	return sizeof(*r);
}
+
+
+/*
+ * Card has 3 dedicated temp sensors (read from SMC):
+ * 0 Air Inlet (aka West)
+ * 1 Air exhaust (aka East)
+ * 2 GDDR memory (not sure which chip)
+ *
+ * VRs can measure temperature too, which may be read
+ * from SMC (via I2C bus) or the VRs directly (via SVID).
+ * 3 Vccp VR (IR3538) temp
+ * 4 Vddq VR (IR3541, loop 1) temp
+ * 5 Vddg VR (IR3541, loop 2) temp
+ * Note: Vddg and Vddq are measured on the same VR,
+ * likely will be the same reading (or very close).
+ *
+ * SBOX board temperature sensors are not connected
+ * in KnC (SBOX HAS vol 1, section 1.40.1). Instead it
+ * relies on SMC to 'broadcast' sensor telemetry into
 * the KnC's TMU unit via its I2C bus.
+ * Currently it doesn't, though a DCR has been filed.
+ */
+
/*
 * MT 'get' handler: temperatures.
 * Refreshes the cached 'temp' record: board/VR sensors from SVID
 * and/or SMC (or SBOX telemetry registers as fallback), plus the
 * nine per-die sensors (current and max) from SBOX, then copies
 * the record into the caller's buffer @p.
 */
int
mr_get_temp(void * p)
{
	struct mr_rsp_temp * r;
	uint32_t die1, die2, die3;	/* Die temps */
	uint32_t dmx1, dmx2, dmx3;	/* Max die temps */
#if USE_SVID
	int tvccp, tvddq, tvddg;	/* VR temps */
#endif
#if USE_SMC
	static struct mr_rsp_tsns tnil = { 0, 3 };
#endif

#if USE_SVID
	/*
	 * Get VR temperatures over SVID.
	 * These are _all_ positive numbers.
	 */
	tvccp = SvidCmd(SVID_VCCP, VR12Cmd_GetReg, VR12Reg_Temp);
	tvddq = SvidCmd(SVID_VDDQ, VR12Cmd_GetReg, VR12Reg_Temp);
	tvddg = SvidCmd(SVID_VDDG, VR12Cmd_GetReg, VR12Reg_Temp);
	if (tvccp < 0 || tvddq < 0 || tvddg < 0)
		return -MR_ERR_SMC;
	temp.vccp.cur = GET_BITS(7, 0, tvccp);
	temp.vddq.cur = GET_BITS(7, 0, tvddq);
	temp.vddg.cur = GET_BITS(7, 0, tvddg);
#endif

#if USE_SMC
	/*
	 * Get temp sensor readings from SMC.
	 * According to MAS 0.30 it presents
	 *  - CPU die temp (just one value)
	 *  - Fan exhaust temp
	 *  - Fan inlet temp
	 *  - Vccp VR temp
	 *  - Vddg VR temp
	 *  - Vddq VR temp
	 *  - GDDR temp
	 *
	 * Still need to screen for good data.
	 * Top 2 bits decode as
	 *  00	Data OK
	 *  01	Upper threshold reached
	 *  10	Lower threshold reached
	 *  11	Data unavailable
	 * Assume data is valid even if a threshold reached
	 */
	{
		uint32_t fin, fout, gddr;	/* Sensor temps */
		uint32_t vccp, vddg, vddq;	/* VR temps */
		uint32_t die;			/* Die summary */

		/* Preset to "unavailable"; each good read overrides */
		temp.die  = temp.fin  = temp.fout =
		temp.vccp = temp.vddg = temp.vddq = tnil;
		if (! mr_smc_rd(MR_SMC_TEMP_CPU, &die)) {
			temp.die.c_val = GET_BITS(31, 30, die);
			if (temp.die.c_val != 0x3)
				temp.die.cur = GET_BITS(15, 0, die);
		}
		if (! mr_smc_rd(MR_SMC_TEMP_EXHAUST, &fout)) {
			temp.fout.c_val = GET_BITS(31, 30, fout);
			if (temp.fout.c_val != 0x3)
				temp.fout.cur = GET_BITS(15, 0, fout);
		}
		if (! mr_smc_rd(MR_SMC_TEMP_INLET, &fin)) {
			temp.fin.c_val = GET_BITS(31, 30, fin);
			if (temp.fin.c_val != 0x3)
				temp.fin.cur = GET_BITS(15, 0, fin);
		}
		if (! mr_smc_rd(MR_SMC_TEMP_VCCP, &vccp)) {
			temp.vccp.c_val = GET_BITS(31, 30, vccp);
			if (temp.vccp.c_val != 0x3)
				temp.vccp.cur = GET_BITS(15, 0, vccp);
		}
		if (! mr_smc_rd(MR_SMC_TEMP_VDDG, &vddg)) {
			temp.vddg.c_val = GET_BITS(31, 30, vddg);
			if (temp.vddg.c_val != 0x3)
				temp.vddg.cur = GET_BITS(15, 0, vddg);
		}
		if (! mr_smc_rd(MR_SMC_TEMP_VDDQ, &vddq)) {
			temp.vddq.c_val = GET_BITS(31, 30, vddq);
			if (temp.vddq.c_val != 0x3)
				temp.vddq.cur = GET_BITS(15, 0, vddq);
		}
		if (! mr_smc_rd(MR_SMC_TEMP_GDDR, &gddr)) {
			temp.gddr.c_val = GET_BITS(31, 30, gddr);
			if (temp.gddr.c_val != 0x3)
				temp.gddr.cur = GET_BITS(15, 0, gddr);
		}
	}
#else
	/*
	 * The TMU registers relies on telemetry broadcasts from
	 * the SMC in order to report current data, early SMC
	 * firmware does not provide telemetry at all.
	 * Mapping of 'board temps' to physical sensors isn't
	 * really defined anywhere. Based on FreeBSD comments
	 * the map is:
	 *   0	Air Inlet
	 *   1	VCCP VR
	 *   2	GDDR (not sure which chip)
	 *   3	GDDR VR
	 *
	 *TBD: verify map on actual CRB
	 */
	{
		uint32_t btr1, btr2;	/* Board temps */
		uint32_t tsta;		/* Thermal status */
		uint32_t fsc;		/* Fan controller status */

		fsc = mr_sbox_rl(0, SBOX_STATUS_FAN2);
		btr1 = mr_sbox_rl(0, SBOX_BOARD_TEMP1);
		btr2 = mr_sbox_rl(0, SBOX_BOARD_TEMP2);
		tsta = mr_sbox_rl(0, SBOX_THERMAL_STATUS);
		/* Bit 15/31 of the board temp registers flags a valid reading */
		temp.fin.cur  = (btr1 & (1 << 15)) ? GET_BITS( 8,  0, btr1) : 0;
		temp.vccp.cur = (btr1 & (1 << 31)) ? GET_BITS(24, 16, btr1) : 0;
		temp.gddr.cur = (btr2 & (1 << 15)) ? GET_BITS( 8,  0, btr2) : 0;
		temp.vddq.cur = (btr2 & (1 << 31)) ? GET_BITS(24, 16, btr2) : 0;
		temp.vddg.cur = GET_BITS(19, 12, fsc);
		/* Board temperature = hottest of the four board sensors */
		temp.brd.cur = 0;
		if (temp.fin.cur > temp.brd.cur)
			temp.brd.cur = temp.fin.cur;
		if (temp.vccp.cur > temp.brd.cur)
			temp.brd.cur = temp.vccp.cur;
		if (temp.gddr.cur > temp.brd.cur)
			temp.brd.cur = temp.gddr.cur;
		if (temp.vddq.cur > temp.brd.cur)
			temp.brd.cur = temp.vddq.cur;
		if (tsta & (1 << 31))
			temp.die.cur = GET_BITS(30, 22, tsta);
	}
#endif

	/*
	 * Raw SBOX data for die temperatures.
	 *
	 *TBD: do these depend on SMC telemetry?
	 *     If so they probably won't work until DCR in place.
	 */
	die1 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP0);
	die2 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP1);
	die3 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP2);
	dmx1 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP0);
	dmx2 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP1);
	dmx3 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP2);

	/*
	 * Die temperatures, three 10-bit fields per register.
	 * Always positive numbers (or zero for unfused parts)
	 */
	temp.dies[0].cur = GET_BITS( 9,  0, die1);
	temp.dies[1].cur = GET_BITS(19, 10, die1);
	temp.dies[2].cur = GET_BITS(29, 20, die1);
	temp.dies[3].cur = GET_BITS( 9,  0, die2);
	temp.dies[4].cur = GET_BITS(19, 10, die2);
	temp.dies[5].cur = GET_BITS(29, 20, die2);
	temp.dies[6].cur = GET_BITS( 9,  0, die3);
	temp.dies[7].cur = GET_BITS(19, 10, die3);
	temp.dies[8].cur = GET_BITS(29, 20, die3);

	/*
	 * Die max temp (probably 0 for unfused parts)
	 */
	temp.dies[0].max = GET_BITS( 9,  0, dmx1);
	temp.dies[1].max = GET_BITS(19, 10, dmx1);
	temp.dies[2].max = GET_BITS(29, 20, dmx1);
	temp.dies[3].max = GET_BITS( 9,  0, dmx2);
	temp.dies[4].max = GET_BITS(19, 10, dmx2);
	temp.dies[5].max = GET_BITS(29, 20, dmx2);
	temp.dies[6].max = GET_BITS( 9,  0, dmx3);
	temp.dies[7].max = GET_BITS(19, 10, dmx3);
	temp.dies[8].max = GET_BITS(29, 20, dmx3);

	r = (struct mr_rsp_temp *) p;
	*r = temp;
	return sizeof(*r);
}
+
+
/*
 * MT 'get' handler: fan state.
 * Fills the caller's response buffer @p directly (no cached
 * record) with fan RPM, PWM percentage and override status,
 * read from the SMC or, as fallback, SBOX registers.
 */
int
mr_get_fan(void * p)
{
	struct mr_rsp_fan * r;
	uint32_t fs, fp;
#if USE_SMC
	uint32_t fa;
#endif

	r = (struct mr_rsp_fan *) p;

	/*
	 * Preference is SMC data.
	 * Not sure if the SBOX fan registers work in KnC
	 */
#if USE_SMC
	/*
	 * Read fan state from SMC.
	 * No info on override available.
	 * Failed reads are marked "data unavailable" (top 2 bits 11)
	 * so the screening below skips them.
	 */
	r->override = 0;
	r->r_val = r->p_val = 3;
	if (mr_smc_rd(MR_SMC_FAN_TACH, &fs))
		fs = PUT_BITS(31, 30, 3);
	if (mr_smc_rd(MR_SMC_FAN_PWM, &fp))
		fp = PUT_BITS(31, 30, 3);
	if (mr_smc_rd(MR_SMC_FAN_PWM_ADD, &fa))
		fa = PUT_BITS(31, 30, 3);

	/*
	 * Still need to screen for good data.
	 * Top 2 bits decode as
	 *  00	Data OK
	 *  01	Reserved
	 *  10	Lower threshold reached (or reserved)
	 *  11	Data unavailable
	 * Assume data is still valid if a threshold reached
	 */
	if (GET_BITS(31, 30, fs) != 0x3) {
		/*
		 * The override concept from KnF (and SBOX registers)
		 * seems to have been replaced with a PWM adder.
		 * Propose to set override flag if adder is non-zero.
		 */
		r->r_val = 0;
		r->rpm = GET_BITS(15, 0, fs);
		if (GET_BITS(31, 30, fp) != 0x3) {
			r->p_val = 0;
			r->pwm = GET_BITS(7, 0, fp);
			if (GET_BITS(31, 30, fa) != 0x3) {
				fa = GET_BITS(7, 0, fa);
				if (fa) {
					/* Reported PWM = base + adder, capped at 100% */
					r->override = 1;
					r->pwm += fa;
					if (r->pwm > 100)
						r->pwm = 100;
				}
			}
		}
	}
#else
	/*
	 * Read fan state from SBOX registers
	 * Require SMC telemetry to work.
	 */
	fs = mr_sbox_rl(0, SBOX_STATUS_FAN1);
	fp = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN);

	r->override = GET_BIT(15, fp);
	r->rpm = GET_BITS(15, 0, fs);
	if (r->override)
		r->pwm = GET_BITS( 7, 0, fp);
	else
		r->pwm = GET_BITS(23, 16, fs);
#endif

	return sizeof(*r);
}
+
+
+int
+mr_get_ecc(void * p)
+{
+ struct mr_rsp_ecc * r;
+
+ r = (struct mr_rsp_ecc *) p;
+ *r = ecc;
+ return sizeof(*r);
+}
+
+
/*
 * MT 'get' handler: turbo mode.
 * Refreshes the cached 'trbo' record from the PM module's
 * callback (when built with USE_PM) and copies it into the
 * caller's buffer @p.
 */
int
mr_get_trbo(void * p)
{
	struct mr_rsp_trbo * r;

	/*
	 * Get current value from PM
	 */
#if USE_PM
	int (* fnc)(void);

	fnc = pm_cb.micpm_get_turbo;
	if (fnc) {
		uint32_t pm;

		/* PM status word: bit 1 = active state, bit 2 = available */
		pm = fnc();
		trbo.state = GET_BIT(1, pm);
		trbo.avail = GET_BIT(2, pm);
		/* Can't have a 'set' request if turbo isn't available */
		if (! trbo.avail)
			trbo.set = 0;
	}
#endif

	r = (struct mr_rsp_trbo *) p;
	*r = trbo;
	return sizeof(*r);
}
+
+
+int
+mr_get_pmcfg(void * p)
+{
+ struct mr_rsp_pmcfg * r;
+
+#if USE_PM
+ int (* fnc)(void);
+
+ fnc = pm_cb.micpm_get_pmcfg;
+ if (fnc)
+ pmcfg.mode = fnc();
+#endif
+
+ r = (struct mr_rsp_pmcfg *) p;
+ *r = pmcfg;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_led(void * p)
+{
+ struct mr_rsp_led * r;
+ uint32_t led;
+
+ if (mr_smc_rd(MR_SMC_LED_CODE, &led))
+ return -MR_ERR_SMC;
+
+ r = (struct mr_rsp_led *) p;
+ r->led = GET_BIT(0, led);
+ return sizeof(*r);
+}
+
+
+int
+mr_get_prochot(void * p)
+{
+ struct mr_rsp_ptrig * r;
+ uint32_t pwr0;
+ uint32_t time0;
+
+ if (mr_smc_rd(MR_SMC_PWR_LIM_0, &pwr0) ||
+ mr_smc_rd(MR_SMC_TIME_WIN_0, &time0))
+ return -MR_ERR_SMC;
+
+ r = (struct mr_rsp_ptrig *) p;
+ r->power = GET_BITS(15, 0, pwr0);
+ r->time = GET_BITS(15, 0, time0);
+ return sizeof(*r);
+}
+
+
+int
+mr_get_pwralt(void * p)
+{
+ struct mr_rsp_ptrig * r;
+ uint32_t pwr1;
+ uint32_t time1;
+
+ if (mr_smc_rd(MR_SMC_PWR_LIM_1, &pwr1) ||
+ mr_smc_rd(MR_SMC_TIME_WIN_1, &time1))
+ return -MR_ERR_SMC;
+
+ r = (struct mr_rsp_ptrig *) p;
+ r->power = GET_BITS(15, 0, pwr1);
+ r->time = GET_BITS(15, 0, time1);
+ return sizeof(*r);
+}
+
+
+int
+mr_get_perst(void * p)
+{
+ struct mr_rsp_perst * r;
+ uint32_t perst;
+
+ if (mr_smc_rd(MR_SMC_PWR_LIM_PERS, &perst))
+ return -MR_ERR_SMC;
+
+ r = (struct mr_rsp_perst *) p;
+ r->perst = GET_BIT(0, perst);
+ return sizeof(*r);
+}
+
+
/*
 * MT 'get' handler: throttle state.
 * Delegates to the PM module to fill the caller's buffer @p.
 * NOTE(review): when built without USE_PM the buffer is returned
 * unmodified — callers then see whatever was in it; confirm this
 * is intended.
 */
int
mr_get_ttl(void * p)
{
	struct mr_rsp_ttl * r;

	r = (struct mr_rsp_ttl *) p;

#if USE_PM
	mr_pm_ttl(r);
#endif

	return sizeof(*r);
}
+
+
+/*
+**
+** Card specific 'Set' functions
+** Input screening takes place here (to the extent possible).
+**
+*/
+
+
/*
 * MT 'set' handler: core (VCCP) voltage.
 * Validates the requested value (in uV) and programs it into the
 * VR over SVID.  Only available in SVID builds; otherwise the
 * operation is rejected with -MR_ERR_INVOP.
 */
int
mr_set_volt(void * p)
{
#if USE_SVID
	uint32_t err, val;
	uint8_t svid;

	/*
	 * Ensure it's a supported value
	 * Which limits to use, physical or PM list?
	 */
	val = *(uint32_t *) p;
	svid = volt2svid(val);
#if 1
	/* Screening by conversion: volt2svid() returns 0 for out-of-range */
	{
		if (!svid)
			return -MR_ERR_RANGE;
	}
#else
	/* Disabled alternative: exact match against the supported list */
	{
		int i;

		for(i = 0; i < MR_PTAB_LEN; i++)
			if (volt.supt[i] == val)
				break;
		if (i == MR_PTAB_LEN)
			return -MR_ERR_RANGE;
	}
#endif

	/*
	 * Read-modify-write the core voltage VID register
	 */
	err = SvidCmd(SVID_VCCP, VR12Cmd_SetVID_Slow, svid);
	printk("SetVolt: %d -> %08x (err %08x)\n", val, svid, err);

	return err ? -MR_ERR_SMC : 0;
#else
	return -MR_ERR_INVOP;
#endif
}
+
+
/*
 * MT 'set' handler: core frequency.
 * Validates the requested value (kHz) against the supported list,
 * converts it to a core ratio and programs it into SBOX_COREFREQ
 * with the 'select ratio' bit set.
 */
int
mr_set_freq(void * p)
{
	uint32_t cf, msk, new, val;
	uint16_t rat;
	int i;

	/*
	 * Ensure it's a supported value
	 */
	val = *(uint32_t *) p;
	for(i = 0; i < MR_PTAB_LEN; i++)
		if (freq.supt[i] == val)
			break;
	if (i == MR_PTAB_LEN)
		return -MR_ERR_RANGE;

	/*
	 * Core Frequency:
	 *  11:0	Base ratio
	 *  15		Fuse override
	 *  31		Select ratio
	 * Base ratio accepted only if (bit 15 | bit 31 | OC disable) == 010
	 * Pre-scale frequency to counter for any ICC trickery.
	 * Not nice, makes exact table matches difficult!!
	 */
	val = (val * icc_fwd()) / ICC_NOM;
	rat = freq2ratio(val/1000, cpu_tab, ARRAY_SIZE(cpu_tab), 200);
	/* Clear ratio + fuse-override + select bits, then set new ratio */
	cf = mr_sbox_rl(0, SBOX_COREFREQ);
	msk = ~(PUT_BITS(11, 0, ~0) | PUT_BIT(15, 1) | PUT_BIT(31, 1));
	new = (cf & msk) | PUT_BITS(11, 0, rat) | PUT_BIT(31, 1);
	mr_sbox_wl(0, SBOX_COREFREQ, new);
	printk("SetFreq: %d -> %08x (%08x)\n", val, new, cf);

	/*
	 *TBD:
	 * We just changed the system's base clock without
	 * re-calibrating the APIC timer tick counters.
	 * There is probably a function call for the cpu-freq
	 * driver to deal with this, so should we call it?
	 */

	return 0;
}
+
+
+int
+mr_set_plim(void * p)
+{
+ plim.phys = *(uint32_t *) p;
+
+ /*
+ * Notify PM of change
+ *TBD: not supported, remove?
+ */
+ return 0;
+}
+
+
/*
 * MT 'set' handler: fan override.
 * Validates the request (override must be 0/1, pwm 0..99) and
 * programs either the SMC PWM-adder register or, as fallback,
 * the SBOX fan #1 override register.
 */
int
mr_set_fan(void * p)
{
	struct mr_set_fan * fc;

	/*
	 * Ensure operation is valid, i.e. no garbage
	 * in override flag (only 1 and 0 allowed) and
	 * that pwm is in range 0 through 99.
	 */
	fc = (struct mr_set_fan *) p;
	if (GET_BITS(7, 1, fc->override) || fc->pwm >= 100)
		return -MR_ERR_RANGE;

#if USE_SMC
	{
		uint32_t dat;

		/*
		 * Determine the PWM-adder value, and send it to the SMC.
		 * Subsequent 'GET' fan will add the calculated PWM and
		 * this adder to report current PWM percentage.
		 * Only way to retrieve the adder is via GET_SMC(0x4b).
		 */
		if (fc->override)
			dat = fc->pwm;
		else
			dat = 0;

		if (mr_smc_wr(MR_SMC_FAN_PWM_ADD, &dat))
			return -MR_ERR_SMC;
	}
#else
	/*
	 * Read-modify-write the fan override register
	 * Control of fan #1 only, don't touch #2
	 * Note: require SMC to support SBOX registers
	 *       which is not on the radar right now.
	 */
	{
		uint32_t fcor, fco1, fco2;

		/* Preserve fan #2's half of the register */
		fcor = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN);
		fco2 = GET_BITS(31, 16, fcor);
		if (fc->override)
			fco1 = PUT_BIT(15, 1) | fc->pwm;
		else
			fco1 = 0;
		mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN,
				PUT_BITS(31, 16, fco2) | PUT_BITS(15, 0, fco1));
	}
#endif

	return 0;
}
+
+
+int
+mr_set_trbo(void * p)
+{
+ uint32_t tmp;
+#if USE_PM
+ void (* fnc)(int);
+#endif
+
+ /*
+ * Only values 0 and 1 allowed
+ */
+ tmp = *(uint32_t *) p;
+ if (GET_BITS(31, 1, tmp))
+ return -MR_ERR_RANGE;
+ trbo.set = tmp;
+
+#if USE_PM
+ /*
+ * Notify PM of new value
+ */
+ fnc = pm_cb.micpm_set_turbo;
+ if (fnc)
+ fnc(trbo.set);
+#endif
+
+ return 0;
+}
+
+
+int
+mr_set_led(void * p)
+{
+ uint32_t led;
+
+ /*
+ * Only values 0 and 1 allowed
+ */
+ led = *(uint32_t *) p;
+ if (GET_BITS(31, 1, led))
+ return -MR_ERR_RANGE;
+
+ if (mr_smc_wr(MR_SMC_LED_CODE, &led))
+ return -MR_ERR_SMC;
+
+ return 0;
+}
+
+
+int
+mr_set_prochot(void * p)
+{
+ struct mr_rsp_ptrig * trig;
+ uint32_t pwr0;
+ uint32_t time0;
+
+ trig = (struct mr_rsp_ptrig *) p;
+ pwr0 = trig->power;
+ time0 = trig->time;
+
+ /*
+ * Check for sane values
+ *TBD: check pwr0 higher than current pwr1?
+ */
+ if (pwr0 < 50 || pwr0 > 400)
+ return -MR_ERR_RANGE;
+ if (time0 < 50 || time0 > 1000)
+ return -MR_ERR_RANGE;
+
+ if (mr_smc_wr(MR_SMC_PWR_LIM_0, &pwr0) ||
+ mr_smc_wr(MR_SMC_TIME_WIN_0, &time0))
+ return -MR_ERR_SMC;
+
+ return 0;
+}
+
+
+int
+mr_set_pwralt(void * p)
+{
+ struct mr_rsp_ptrig * trig;
+ uint32_t pwr1;
+ uint32_t time1;
+
+ trig = (struct mr_rsp_ptrig *) p;
+ pwr1 = trig->power;
+ time1 = trig->time;
+
+ /*
+ * Check for sane values
+ *TBD: check pwr1 lower than current pwr0?
+ */
+ if (pwr1 < 50 || pwr1 > 400)
+ return -MR_ERR_RANGE;
+ if (time1 < 50 || time1 > 1000)
+ return -MR_ERR_RANGE;
+
+ if (mr_smc_wr(MR_SMC_PWR_LIM_1, &pwr1) ||
+ mr_smc_wr(MR_SMC_TIME_WIN_1, &time1))
+ return -MR_ERR_SMC;
+
+ return 0;
+}
+
+
+int
+mr_set_perst(void * p)
+{
+ uint32_t perst;
+
+ /*
+ * Only values 0 and 1 allowed
+ */
+ perst = *(uint32_t *) p;
+ if (GET_BITS(31, 1, perst))
+ return -MR_ERR_RANGE;
+
+ if (mr_smc_wr(MR_SMC_PWR_LIM_PERS, &perst))
+ return -MR_ERR_SMC;
+
+ return 0;
+}
+
+
+#if USE_PM
+/*
+**
+** API functions dedicated for PM support
+**
+** These functions are embedded within the MT callout table
+** and thus need to follow the calling convention, which
+** for 'get' functions is to pass an opaque pointer to a buffer
+** to hold retrieved data and on return get a status code (positive
+** on success, negative on failures) and for 'put' functions is
+** to pass an opaque pointer to a buffer holding input data.
+**
+** Function list as per PM needs:
+**
+** pm_get_pl0 reads 0x2c, 0x2d and 0x2e
+** pm_set_pl0 writes 0x2c and 0x2d
+**
+** pm_get_pl1 reads 0x2f and 0x30
+** pm_set_pl1 writes 0x2f and 0x30
+**
+** pm_get_pavg reads 0x35 and 0x36
+**
+** pm_get_pttl reads 0x38 and 0x39
+**
+** pm_get_volt reads 0x3c, 0x3d and 0x3e
+**
+** pm_get_temp reads 0x40, 0x43, 0x44 and 0x45
+**
+** pm_get_tach reads 0x49 and 0x4a
+**
+** pm_get_tttl reads 0x4e and 0x4f
+**
+** pm_get_fttl reads 0x2b
+** pm_set_fttl writes 0x2b
+**
+*/
+
+#include "micpm_api.h"
+
+/*
+ * PM callout: read power limit 0, its time window and guard band.
+ * SMC reads are best effort; zeros are reported on read failure.
+ * Returns size of the response structure.
+ */
+int
+pm_get_pl0(void * p)
+{
+	struct pm_rsp_plim * rsp = (struct pm_rsp_plim *) p;
+	uint32_t lim = 0, win = 0, grd = 0;
+
+	mr_smc_rd(MR_SMC_PWR_LIM_0, &lim);
+	mr_smc_rd(MR_SMC_TIME_WIN_0, &win);
+	mr_smc_rd(MR_SMC_PWR_LIM0_GRD, &grd);
+
+	rsp->pwr_lim = GET_BITS(15, 0, lim);
+	rsp->time_win = GET_BITS(15, 0, win);
+	rsp->guard_band = GET_BITS(15, 0, grd);
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: write power limit 0 and its time window.
+ * Only the lower 16 bits of either field may be set.
+ */
+int
+pm_set_pl0(void * p)
+{
+	struct pm_cmd_plim * cmd = (struct pm_cmd_plim *) p;
+
+	if (GET_BITS(31, 16, cmd->pwr_lim) ||
+	    GET_BITS(31, 16, cmd->time_win))
+		return -MR_ERR_RANGE;
+
+	/*
+	 * This does not allow caller to tell which failed.
+	 *TBD: do we care?
+	 */
+	if (mr_smc_wr(MR_SMC_PWR_LIM_0, &cmd->pwr_lim) ||
+	    mr_smc_wr(MR_SMC_TIME_WIN_0, &cmd->time_win))
+		return -MR_ERR_SMC;
+
+	return 0;
+}
+
+/*
+ * PM callout: read power limit 1 and its time window.
+ * There is no guard band for limit 1; it is reported as 0.
+ */
+int
+pm_get_pl1(void * p)
+{
+	struct pm_rsp_plim * rsp = (struct pm_rsp_plim *) p;
+	uint32_t lim = 0, win = 0;
+
+	mr_smc_rd(MR_SMC_PWR_LIM_1, &lim);
+	mr_smc_rd(MR_SMC_TIME_WIN_1, &win);
+
+	rsp->pwr_lim = GET_BITS(15, 0, lim);
+	rsp->time_win = GET_BITS(15, 0, win);
+	rsp->guard_band = 0;
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: write power limit 1 and its time window.
+ * Only the lower 16 bits of either field may be set.
+ */
+int
+pm_set_pl1(void * p)
+{
+	struct pm_cmd_plim * cmd = (struct pm_cmd_plim *) p;
+
+	if (GET_BITS(31, 16, cmd->pwr_lim) ||
+	    GET_BITS(31, 16, cmd->time_win))
+		return -MR_ERR_RANGE;
+
+	/*
+	 * This does not allow caller to tell which failed.
+	 *TBD: do we care?
+	 */
+	if (mr_smc_wr(MR_SMC_PWR_LIM_1, &cmd->pwr_lim) ||
+	    mr_smc_wr(MR_SMC_TIME_WIN_1, &cmd->time_win))
+		return -MR_ERR_SMC;
+
+	return 0;
+}
+
+/*
+ * PM callout: read the two average power sensors.
+ * Status fields are preset to 3 (invalid) so a failed SMC
+ * read is reported as such rather than as stale data.
+ */
+int
+pm_get_pavg(void * p)
+{
+	struct pm_rsp_pavg * rsp = (struct pm_rsp_pavg *) p;
+	uint32_t pwr0, pwr1;
+
+	pwr0 = pwr1 = PUT_BITS(31, 30, 3);
+	mr_smc_rd(MR_SMC_AVG_PWR_0, &pwr0);
+	mr_smc_rd(MR_SMC_AVG_PWR_1, &pwr1);
+
+	rsp->stat_0 = GET_BITS(31, 30, pwr0);
+	rsp->stat_1 = GET_BITS(31, 30, pwr1);
+	rsp->pwr_0 = GET_BITS(29, 0, pwr0);
+	rsp->pwr_1 = GET_BITS(29, 0, pwr1);
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: read the power throttle state and, when throttling,
+ * the throttle duration. Duration status 3 means "not valid".
+ */
+int
+pm_get_pttl(void * p)
+{
+	struct pm_rsp_pttl * rsp;
+	uint32_t dur, ttl;
+
+	if (mr_smc_rd(MR_SMC_PWR_TTL, &ttl))
+		return -MR_ERR_SMC;
+
+	rsp = (struct pm_rsp_pttl *) p;
+	rsp->pwr_ttl = GET_BIT(0, ttl);
+
+	dur = PUT_BITS(31, 30, 3);
+	if (rsp->pwr_ttl)
+		mr_smc_rd(MR_SMC_PWR_TTL_DUR, &dur);
+	rsp->stat_dur = GET_BITS(31, 30, dur);
+	rsp->duration = GET_BITS(15, 0, dur);
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: read the three voltage sensors (VCCP, VDDG, VDDQ).
+ * Status fields are preset to 3 (invalid) in case SMC reads fail.
+ */
+int
+pm_get_volt(void * p)
+{
+	struct pm_rsp_volt * rsp = (struct pm_rsp_volt *) p;
+	uint32_t vccp, vddg, vddq;
+
+	vccp = vddg = vddq = PUT_BITS(31, 30, 3);
+	mr_smc_rd(MR_SMC_VOLT_VCCP, &vccp);
+	mr_smc_rd(MR_SMC_VOLT_VDDG, &vddg);
+	mr_smc_rd(MR_SMC_VOLT_VDDQ, &vddq);
+
+	rsp->stat_vccp = GET_BITS(31, 30, vccp);
+	rsp->stat_vddg = GET_BITS(31, 30, vddg);
+	rsp->stat_vddq = GET_BITS(31, 30, vddq);
+	rsp->vccp = GET_BITS(15, 0, vccp);
+	rsp->vddg = GET_BITS(15, 0, vddg);
+	rsp->vddq = GET_BITS(15, 0, vddq);
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: read the four temperature sensors (CPU + VR rails).
+ * Status fields are preset to 3 (invalid) in case SMC reads fail.
+ */
+int
+pm_get_temp(void * p)
+{
+	struct pm_rsp_temp * rsp = (struct pm_rsp_temp *) p;
+	uint32_t cpu, vccp, vddg, vddq;
+
+	cpu = vccp = vddg = vddq = PUT_BITS(31, 30, 3);
+	mr_smc_rd(MR_SMC_TEMP_CPU, &cpu);
+	mr_smc_rd(MR_SMC_TEMP_VCCP, &vccp);
+	mr_smc_rd(MR_SMC_TEMP_VDDG, &vddg);
+	mr_smc_rd(MR_SMC_TEMP_VDDQ, &vddq);
+
+	rsp->stat_cpu = GET_BITS(31, 30, cpu);
+	rsp->stat_vccp = GET_BITS(31, 30, vccp);
+	rsp->stat_vddg = GET_BITS(31, 30, vddg);
+	rsp->stat_vddq = GET_BITS(31, 30, vddq);
+	rsp->cpu = GET_BITS(15, 0, cpu);
+	rsp->vccp = GET_BITS(15, 0, vccp);
+	rsp->vddg = GET_BITS(15, 0, vddg);
+	rsp->vddq = GET_BITS(15, 0, vddq);
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: read fan PWM setting and tachometer.
+ * Status fields are preset to 3 (invalid) in case SMC reads fail.
+ */
+int
+pm_get_tach(void * p)
+{
+	struct pm_rsp_tach * rsp = (struct pm_rsp_tach *) p;
+	uint32_t pwm, tach;
+
+	pwm = tach = PUT_BITS(31, 30, 3);
+	mr_smc_rd(MR_SMC_FAN_PWM, &pwm);
+	mr_smc_rd(MR_SMC_FAN_TACH, &tach);
+
+	rsp->stat_pwm = GET_BITS(31, 30, pwm);
+	rsp->stat_tach = GET_BITS(31, 30, tach);
+	rsp->fan_pwm = GET_BITS( 7, 0, pwm);
+	rsp->fan_tach = GET_BITS(15, 0, tach);
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: read the thermal throttle state and, when throttling,
+ * the throttle duration. Duration status 3 means "not valid".
+ */
+int
+pm_get_tttl(void * p)
+{
+	struct pm_rsp_tttl * rsp;
+	uint32_t dur, ttl;
+
+	if (mr_smc_rd(MR_SMC_TRM_TTL, &ttl))
+		return -MR_ERR_SMC;
+
+	rsp = (struct pm_rsp_tttl *) p;
+	rsp->thrm_ttl = GET_BIT(0, ttl);
+
+	dur = PUT_BITS(31, 30, 3);
+	if (rsp->thrm_ttl)
+		mr_smc_rd(MR_SMC_TRM_TTL_DUR, &dur);
+	rsp->stat_dur = GET_BITS(31, 30, dur);
+	rsp->duration = GET_BITS(15, 0, dur);
+
+	return sizeof(*rsp);
+}
+
+/*
+ * PM callout: report whether throttling is forced on.
+ *
+ * Returns sizeof the response on success, -MR_ERR_SMC on SMC
+ * read failure. Bug fix: the failure path used to return the
+ * positive MR_ERR_SMC, which callers following the callout
+ * convention (positive = success) would mistake for success.
+ */
+int
+pm_get_fttl(void * p)
+{
+	struct pm_rsp_fttl * r;
+	uint32_t ttl;
+
+	if (mr_smc_rd(MR_SMC_FORCE_TTL, &ttl))
+		return -MR_ERR_SMC;
+
+	r = (struct pm_rsp_fttl *) p;
+	r->forced = GET_BIT(0, ttl);
+
+	return sizeof(*r);
+}
+
+/*
+ * PM callout: force throttling on or off.
+ * Only boolean values (0 or 1) are accepted.
+ */
+int
+pm_set_fttl(void * p)
+{
+	uint32_t force;
+
+	force = ((struct pm_rsp_fttl *) p)->forced;
+	if (GET_BITS(31, 1, force))
+		return -MR_ERR_RANGE;
+
+	return mr_smc_wr(MR_SMC_FORCE_TTL, &force) ? -MR_ERR_SMC : 0;
+}
+
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS MT module driver
+ *
+ * Code and data structures to handle get/set tasks for KnF.
+ * Parties accessing the data structures are supposed to use the
+ * micras_mt_tsk() routines to ensure integrity and consistency.
+ * Particularly important when handling sysfs nodes and actions
+ * requested from SCIF connections must use that method in order
+ * to guarantee serialized access.
+ *
+ * Even if read-only access to latest valid data is required,
+ * it should go through micras_mt_tsk() using dedicated handlers
+ * in this module.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/io.h>
+#include <linux/utsname.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/jiffies.h>
+#include <linux/kernel_stat.h>
+#include <linux/bitmap.h>
+#include <generated/compile.h>
+#include <generated/utsrelease.h>
+#include <mic/micbaseaddressdefine.h>
+#include <mic/micsboxdefine.h>
+#include "micras_api.h"
+#include "micmca_api.h"
+#include "micras.h"
+
+
+/*
+ * Persistent data accessible through the CP api.
+ * Some functions just read/modify hardware CSRs
+ * and thus need no storage between invocations.
+ */
+
+extern struct mr_rsp_vers vers;
+extern struct mr_rsp_volt volt;
+extern struct mr_rsp_freq freq;
+extern struct mr_rsp_power power;
+extern struct mr_rsp_plim plim;
+extern struct mr_rsp_gddr gddr;
+extern struct mr_rsp_gvolt gvolt;
+extern struct mr_rsp_gfreq gfreq;
+extern struct mr_rsp_temp temp;
+extern struct mr_rsp_ecc ecc;
+extern struct mr_rsp_trbo trbo;
+extern struct mr_rsp_pmcfg pmcfg;
+
+#if USE_FSC
+/*
+**
+** FSC API
+**
+** The FSC has a back-door communication channel, not documented
+** anywhere in the register spec nor in any HAS or LLD that is
+** available on recent KnF cards (later than rev ??).
+** Found a .35 proposal for it, so it better do. In short, this
+** backdoor relies on fan #2 is not used on KnF and the fact that
+** controls for fan #2 is transmitted over I2C to the fan speed
+** controller (FSC) unaltered, such that it can chose an alternate
+** interpretation of received data.
+**
+** The Fan Speed Override register (SBOX 0x8007d102c) has this
+** definition in the register spec:
+**
+** Bit(s) Usage
+** ------ ----------
+** 7:0 Fan 1 override ratio
+** 14 Fan 1 Set max speed
+** 15 Fan 1 Enable override
+** 23:16 Fan 2 override ratio
+** 30 Fan 2 Set max speed
+** 31 Fan 2 Enable override
+**
+** This register has been repurposed into a Message Gain Bit Bang Register
+** (MGBR) with a 4 bit command and a 16 bit data field, layout is:
+**
+** Bit(s) Usage
+** ------ ----------
+** 7:0 MGBR data 7:0
+** 21:14 MGBR data 15:8
+** 23:22 MGBR command 1:0
+** 31:30 MBGR command 3:2
+**
+** Command Usage
+** 0 Fan 1 Speed Override
+** 1 Power Management and Control Config
+** 7 PMC PCIe Alert Override
+** 8 PMC 2x3 Alert Override
+** 9 PMC 2x4 Alert Override
+** 10 Temperature Override Command
+** 11 General Status Command
+** 12-15 PID Gain Command(s)
+**
+** Fan 1 control works as MGBR command 0, though the spec is unclear on
+** whether the resulting FSO register format is same as the original spec.
+** Specifically, old spec has Fan 1 override enable in FSO bit 15, whereas
+** the MGBR spec has it in MGBR data bit 15 (corresponds to FSO bit 20).
+** Test shows it has to be MGBR bit 9, i.e. compatible with register spec.
+**
+** Fan #2 Status Register (SBOX 0x8007d1028) has been redefined into a
+** Message Gain Bit Bang Status (MGBSR) used to hold return data from
+** the MGBR General Status command in this layout:
+**
+** Bit(s) Usage
+** ------ ----------
+** 23:0 MGBSR data
+** 31:28 MGBR Gen. Sts. selector (bits 23:0 source).
+**
+** To get access to KnF telemetry data, only MGBR command 11 is needed.
+** Bits 7:0 of MGBR data for this command selects the sensor which FSC
+** will report to MGBSR (not sure if one-time or repeatedly). The actual
+** encoding is as follows:
+**
+** 0x00 Fan2Status
+** 0x01 PMC Configuration Command Settings
+** 0x07 Reads the 2x4 IR3275 Configuration Register
+** 0x08 Reads the 2x3 IR3275 Configuration Register
+** 0x09 Reads the PCIe IR3275 Configuration Register
+** 0x0A Reads the Temperature Command Settings
+** 0x20 Maximum Total Card Power - 1s Moving Average (20 Samples)
+** 0x21 Maximum 2x4 Connector Power - 1s Moving Average (20 Samples)
+** 0x22 Maximum 2x3 Connector Power - 1s Moving Average (20 Samples)
+** 0x23 Maximum PCIe Connector Power - 1s Moving Average (20 Samples)
+** 0x30 Maximum Total Card Power - Single Sample
+** 0x31 Maximum 2x4 Connector Power - Single Sample
+** 0x32 Maximum 2x3 Connector Power - Single Sample
+** 0x33 Maximum PCIe Connector Power - Single Sample
+** 0xA0 Returns the current Fan Tcontrol setting for the GPU temperature
+** 0xA1 Maximum Temperature for Temperature Sensor 1 - VCCP
+** 0xA2 Maximum Temperature for Temperature Sensor 2 - Air Inlet
+** 0xA3 Maximum Temperature for Temperature Sensor 3 - NW GDDR
+** 0xA4 Maximum Temperature for Temperature Sensor 4 - V1P5 VDD VR
+** 0xA5 Maximum Temperature for Temperature Sensor 5 - Display Transmitter
+** 0xA6 Maximum Temperature for GPU
+**
+** The 'return' values in MGBSR are 16 bit only, power in Watts, Temp in C.
+**
+** Implementation notes:
+** > The MGBR API is timing sensitive. FSC reads the MGBR register
+** at ~50 mSec intervals over an I2C bus and performs the command
+** on every read, which in case of the General Status command will
+** result in writing FSC internal data to the MGBSR register.
+** A delay is required after every write to MGBR in order to
+** ensure the FSC actually sees it.
+**
+** > I2C bus reads are 7 bytes, writes are 6 bytes, 1 clock at 100 kHz
+** is 10 uSec, 1 byte roughly translates to 10 bits, so minimum delay
+** on I2C from command written to return value is valid becomes
+** 10 * (6 + 7) * 10 uSec = 1.3 mSec
+** The I2C bus on KnF runs slower than 100 kHz, causing transfers
+** to take more time than that to finish.
+** After the initial delay, we may need to wait on a result
+** to arrive in the MGBSR register.
+**
+** > It seems that fan 1 override is a dynamic act, i.e. for it to
+** be in effect the MBGR command needs to be set accordingly.
+** Therefore, when reading telemetry, the MGBR command is set
+** just for a period long enough for it to be seen by FSC and the
+** result to be latched into the MGBSR register. After that period
+** (when fan speed override is active) the MGBR is returned to
+** restore the fan 1 override.
+**
+*/
+
+#define MR_FSC_MGBR_OVR_CMD 0 /* Fan 1 Speed Override */
+#define MR_FSC_MGBR_GEN_CMD 11 /* General Status command */
+
+#define MR_FSC_STATUS 0x00 /* FSC Status & version */
+#define MR_FSC_PMC_CFG 0x01 /* PMC Configuration */
+
+#define MR_FSC_PWR_TOT 0x20 /* Total Power (1 sec avg) */
+#define MR_FSC_PWR_2X4 0x21 /* 2x4 Power (1 sec avg) */
+#define MR_FSC_PWR_2X3 0x22 /* 2x3 Power (1 sec avg) */
+#define MR_FSC_PWR_PCIE 0x23 /* PCIe Power (1 sec avg) */
+
+#define MR_FSC_PWR1_TOT 0x30 /* Total Power (single sample) */
+#define MR_FSC_PWR1_2X4 0x31 /* 2x4 Power (single sample) */
+#define MR_FSC_PWR1_2X3 0x32 /* 2x3 Power (single sample) */
+#define MR_FSC_PWR1_PCIE 0x33 /* PCIe Power (single sample) */
+
+#define MR_FSC_TEMP_VCCP 0xA1 /* VCCP VR Temperature */
+#define MR_FSC_TEMP_INLET 0xA2 /* Card Inlet Temperature */
+#define MR_FSC_TEMP_GDDR 0xA3 /* GDDR Temperature */
+#define MR_FSC_TEMP_VDD 0xA4 /* VDD VR Temperature */
+#define MR_FSC_TEMP_DISP 0xA5 /* Display Transmitter */
+
+
+/*
+ * Simple I/O access routines for FSC registers
+ */
+
+#ifdef MIC_IS_EMULATION
+/*
+ * Emulation does not handle I2C busses in general.
+ * Not sure if FSC is emulated, but won't rely on it.
+ * The following stubs are for emulation only.
+ */
+
+/*
+ * Emulation stub for MGBSR reads: always reports a zero value
+ * and success. Renamed from fsc_mgbr_read to fsc_mgbsr_read to
+ * match the non-emulation implementation and its caller
+ * mr_get_fsc(), which would otherwise fail to link under
+ * MIC_IS_EMULATION.
+ */
+int
+fsc_mgbsr_read(uint32_t * v)
+{
+	if (v)
+		*v = 0;
+
+	return 0;
+}
+
+/*
+ * Emulation stub for MGBR writes: discards the command.
+ * Signature fixed to take a pointer to the data word, matching
+ * the non-emulation implementation and the callers (which pass
+ * &dat); the by-value variant would not compile against them.
+ */
+void
+fsc_mgbr_write(uint8_t c, uint32_t * v)
+{
+}
+
+#else
+
+#if 0
+#define RL printk("%s: %2x -> %08x\n", __FUNCTION__, mgbr_cmd, *val)
+#define WL printk("%s: %2x <- %08x\n", __FUNCTION__, mgbr_cmd, *val)
+#else
+#define RL /* As nothing */
+#define WL /* As nothing */
+#endif
+
+/*
+ * FSC bit-bang state shared by the MGBR/MGBSR helpers below.
+ * Updated on every fsc_mgbr_write(); read by fsc_mgbsr_read()
+ * to decide how to interpret the MGBSR contents.
+ */
+static uint8_t mgbr_cmd; /* Last MGBR command */
+static uint32_t mgbr_dat; /* Last MGBR data */
+static uint32_t fan1_ovr; /* Current fan 1 override command */
+
+/*
+ * Read MGBSR from SBOX
+ *
+ * This function only supports MGBR commands MR_FSC_MGBR_{OVR|GEN}_CMD.
+ * The operation mode is that the command is written to MGBR and after
+ * a while the response shows up in MGBSR, which has fields that tell
+ * which command caused the response (bits 31:28), and for GEN command
+ * also which sensor was read. This function checks both fields.
+ *
+ * We'll poll at 1 mSec rate and allow up to 200 mSec for the
+ * FSC to provide the measure in the SBOX register.
+ */
+
+int
+fsc_mgbsr_read(uint32_t * val)
+{
+ uint32_t mgbsr;
+ int n;
+
+ /*
+ * Poll MGBSR at ~1 mSec intervals, up to ~200 mSec total,
+ * until its command field matches the last MGBR command.
+ */
+ for(n = 0; n < 200; n++) {
+ mgbsr = mr_sbox_rl(0, SBOX_STATUS_FAN2);
+ if ((GET_BITS(31, 28, mgbsr) == mgbr_cmd) ||
+ mgbr_cmd != MR_FSC_MGBR_GEN_CMD || mgbr_dat == 0) {
+ /*
+ * Non-GEN commands (and sensor selects 0/1) return
+ * the full 24 bit MGBSR data field.
+ */
+ if (mgbr_cmd != MR_FSC_MGBR_GEN_CMD ||
+ mgbr_dat <= 1) {
+ *val = GET_BITS(23, 0, mgbsr);
+ RL;
+ return 0;
+ }
+ /*
+ * GEN command: bits 23:16 echo the sensor select,
+ * bits 15:0 carry the sensor value.
+ */
+ if (GET_BITS(23, 16, mgbsr) == mgbr_dat) {
+ *val = GET_BITS(15, 0, mgbsr);
+ RL;
+ return 0;
+ }
+ }
+ myDELAY(1000);
+ }
+
+ /*
+ * Timeout
+ */
+ return 1;
+}
+
+
+/*
+ * Write MGBR on SBOX
+ *
+ * This function only supports MGBR commands MR_FSC_MGBR_{OVR|GEN}_CMD.
+ * The OVR command only when fan 1 speed override is active.
+ * The GEN command is meant to cause a new selectable telemetry to be
+ * pushed into the MBGSR register by the FSC. Any necessary delays
+ * are handled here. Not by the read function.
+ */
+
+void
+fsc_mgbr_write(uint8_t c, uint32_t * val)
+{
+ uint32_t prev_cmd, prev_dat;
+ uint32_t mgbr_reg, mgbr_sel;
+ uint32_t mgbsr, n;
+
+ /*
+ * Remember the previous command/data so we can tell below
+ * whether MGBSR contents could be mistaken for stale data.
+ */
+ prev_cmd = mgbr_cmd;
+ prev_dat = mgbr_dat;
+ mgbr_cmd = GET_BITS(3, 0, c);
+ mgbr_dat = GET_BITS(15, 0, *val);
+
+ /*
+ * Pack command and data into the MGBR bit layout
+ * (see register description at the top of this section).
+ */
+ mgbr_reg = PUT_BITS(31, 30, (mgbr_cmd >> 2)) |
+ PUT_BITS(23, 22, mgbr_cmd) |
+ PUT_BITS(21, 14, (mgbr_dat >> 8)) |
+ PUT_BITS( 7, 0, mgbr_dat);
+ WL;
+ mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN, mgbr_reg);
+
+ /*
+ * Special for Set Fan Speed, we keep track of that one
+ */
+ if (mgbr_cmd == MR_FSC_MGBR_OVR_CMD) {
+ if (GET_BIT(9, mgbr_dat))
+ fan1_ovr = GET_BITS(9, 0, mgbr_dat);
+ else
+ fan1_ovr = 0;
+ }
+
+ /*
+ * If the command issued is the same as the previous command,
+ * there is no way to determine if the MGBSR register is result
+ * of this or the previous command. It is not possible to clear
+ * MGBSR (read-only register), so if it is the same register,
+ * we'll just have to wait long enough for FSC to respond.
+ * Not all MGBR commands are mirrored into top 4 bits of MGBSR,
+ * those gets the simple delay treatment.
+ */
+ if ((mgbr_cmd == prev_cmd && mgbr_dat == prev_dat) ||
+ mgbr_cmd != MR_FSC_MGBR_GEN_CMD || mgbr_dat <= 1) {
+ myDELAY(100 * 1000);
+ return;
+ }
+ /*
+ * Otherwise poll (1 mSec steps, ~200 mSec max) until MGBSR
+ * reflects this command and, for GEN, this sensor select.
+ */
+ mgbr_sel = GET_BITS(7, 0, mgbr_dat);
+ for(n = 0; n < 200; n++) {
+ mgbsr = mr_sbox_rl(0, SBOX_STATUS_FAN2);
+ if (GET_BITS(31, 28, mgbsr) == mgbr_cmd) {
+ if (mgbr_cmd != MR_FSC_MGBR_GEN_CMD)
+ return;
+ if (GET_BITS(23, 16, mgbsr) == mgbr_sel)
+ return;
+ }
+ myDELAY(1000);
+ }
+}
+#undef RL
+#undef WL
+#endif /* EMULATION */
+
+
+/*
+ * Bypass for FSC access.
+ * Somewhat bizarre backdoor to the FSC's MGBR and MGBSR registers.
+ * The FSC interface is asymmetrical by nature since only the General
+ * Status MGBR command can cause data to be returned through MGBSR.
+ * To make it appear as telemetry registers can be read directly
+ * and without need for privileges, the Read operation is rigged to
+ * issue the appropriate MGBR registers itself when necessary.
+ *
+ * To protect the FSC integrity, the SET command are restricted
+ * to privileged users and is only accepting commands that cannot
+ * harm the FSC integrity. For now the whitelist consists of
+ * 0 Fan 1 Speed Override
+ * 1 Power Management and Control Config
+ * 11 General Status command
+ *
+ * To read back the response from a SET command the exact same value
+ * of 'parm' must be passed to a subsequent GET, in which case
+ * the GET routine will not insert its own MGBR command to select
+ * contents of the MGBSR to return.
+ *
+ * Notice that FSC read is equivalent of reading Fan #2 Status register
+ * and FSC write is equivalent of writing Fan Speed Override register.
+ *
+ * This reuse the SMC interface structs, but the semantics are different.
+ *
+ * Return:
+ * r->reg MGBSR sensor select (if applicable) or 0
+ * r->width always 3 (24 bit wide field)
+ * r->rtn.val MGBSR sensor data
+ *
+ * Input:
+ * parm 31:24 MGBR command (must be 0xb)
+ * parm 15:0 MGBR data (sensor select)
+ */
+
+int
+mr_get_fsc(void * p)
+{
+ int rtn;
+ uint32_t raw;
+ struct mr_rsp_smc * r;
+ uint8_t cmd;
+ uint32_t dat, parm;
+
+ /*
+ * Extract MGBR command and dat
+ */
+ parm = * (uint32_t *) p;
+ cmd = GET_BITS(31, 24, parm);
+ dat = GET_BITS(15, 0, parm);
+
+ /*
+ * If the request is different from the last issued
+ * 'SET' command in any way then 'GET' will issue the
+ * corresponding MGBR command, if allowed.
+ */
+ if (mgbr_cmd != cmd || mgbr_dat != dat) {
+ /*
+ * Only allow 'General Status' command
+ */
+ if (cmd != MR_FSC_MGBR_GEN_CMD)
+ return -MR_ERR_PERM;
+
+ /*
+ * Screen against known FSC register widths.
+ * All commands seem to be 16 bit wide.
+ * We insist that unused upper bits are zeros.
+ */
+ if (dat != GET_BITS(23, 0, parm))
+ return -MR_ERR_INVAUX;
+
+ /*
+ * Whitelist of readable sensor selects:
+ * 0 1 20 21 22 23 30 31 32 33 a1 a2 a3 a4 a5
+ * Better way to single out these numbers?
+ */
+ if (! ((dat <= 1) ||
+ (dat >= 0x20 && dat <= 0x23) ||
+ (dat >= 0x30 && dat <= 0x33) ||
+ (dat >= 0xa1 && dat <= 0xa5)))
+ return -MR_ERR_PERM;
+
+ /*
+ * Write MGBR command
+ */
+ fsc_mgbr_write(cmd, &dat);
+ }
+
+ /*
+ * Read MGBSR result
+ */
+ rtn = fsc_mgbsr_read(&raw);
+ if (rtn)
+ return -MR_ERR_SMC;
+
+ /*
+ * Revert to normal fan 1 speed override mode if needed.
+ */
+ if (fan1_ovr)
+ fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &fan1_ovr);
+
+ /*
+ * Build response: sensor select (GEN only), 24 bit width
+ * indicator and the raw MGBSR data field.
+ */
+ r = (struct mr_rsp_smc *) p;
+ if (cmd == MR_FSC_MGBR_GEN_CMD)
+ r->reg = GET_BITS(7, 0, dat);
+ r->width = 3;
+ r->rtn.val = GET_BITS(23, 0, raw);
+
+ return sizeof(*r);
+}
+
+
+int
+mr_set_fsc(void * p)
+{
+ uint8_t cmd;
+ uint32_t dat, parm;
+
+ /* Extract MGBR command (31:24) and data (15:0) from input */
+ parm = * (uint32_t *) p;
+ cmd = GET_BITS(31, 24, parm);
+ dat = GET_BITS(15, 0, parm);
+
+ /*
+ * Screen against known FSC register widths.
+ * All commands seem to be 16 bit wide.
+ * We insist that unused upper bits are zeros.
+ */
+ if (dat != GET_BITS(23, 0, parm))
+ return -MR_ERR_INVAUX;
+
+ /*
+ * 4-bit command code for FSC.
+ * Mask of valid codes needs just 16 bits.
+ * Max valid codes 0..1, 7..15, mask 0xff83.
+ * Non-debug registers reduce mask to 0x0803.
+ */
+ if (! ((1 << cmd) & 0x0803))
+ return -MR_ERR_PERM;
+
+ /*
+ * Write MGBR command and revert to fan 1 speed override mode
+ * if needed (override in effect). Side effect of reverting
+ * is that any response in MGBSR must be read before the next
+ * FSC sample happens, i.e. within 50 mSec.
+ */
+ fsc_mgbr_write(cmd, &dat);
+ if (fan1_ovr)
+ fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &fan1_ovr);
+
+ return 0;
+}
+#endif
+
+
+/*
+**
+** Conversion between CP formats (uV, MHz, etc.)
+** and hardware register formats (SBOX mostly).
+**
+*/
+
+
+/*
+ * VRM11 voltage converters
+ * Only bits 6:1 are being used as follows:
+ * Volt = Max - Res * (Bits -1)
+ * Bits = 1 + (Max - Volt) / Res
+ * The delta divided by resolution is 62.
+ * Bits value of 0 reserved for turning VR off.
+ */
+
+#define VRM11_MAX 1600000 /* 1.60 V */
+#define VRM11_MIN 825000 /* 825 mV */
+#define VRM11_RES 12500 /* 12.5 mV */
+
+/*
+ * Convert a VRM11 VID code into a voltage in uV.
+ * Only bits 6:1 of the VID are significant; a zero field
+ * means the VR is off and maps to 0 uV.
+ */
+uint32_t
+vid2volt(uint8_t vid)
+{
+	uint32_t steps = GET_BITS(6, 1, vid);
+
+	return steps ? VRM11_MAX - VRM11_RES * (steps - 1) : 0;
+}
+
+/*
+ * Convert a voltage in uV into a VRM11 VID code.
+ * Out-of-range voltages and voltages that are not an exact
+ * multiple of the VR resolution yield 0 (VR off).
+ */
+uint8_t
+volt2vid(uint32_t uv)
+{
+	uint32_t steps, delta;
+
+	steps = 0;
+	if (uv >= VRM11_MIN && uv <= VRM11_MAX) {
+		delta = VRM11_MAX - uv;
+		/*
+		 * Why bother check for accurate input?
+		 * Ignoring it just rounds up to nearest!
+		 */
+		if ((delta % VRM11_RES) == 0)
+			steps = 1 + delta / VRM11_RES;
+	}
+	return PUT_BITS(6, 1, steps);
+}
+
+
+/*
+ * PLL tables used to map between hw scale register
+ * value and actual frequencies given a fixed base.
+ * The formula is (probably KnF specific)
+ * freq = Base * Feedback / Feedforward
+ * where
+ * Base = 100 MHz
+ * FeedBack = ratio bits 5:0
+ * FeedForward = ratio bits 7:6 (00 -> 8, 01 -> 4, 10 -> 2, 11 -> 1)
+ *
+ * Overlapping ranges over feedback and feedforward values are
+ * handled by range table(s) below such that lower frequencies
+ * can be selected at a finer granularity.
+ */
+
+/*
+ * One row per feed-forward divider setting; rows are indexed by
+ * the (complemented) feed-forward field of the ratio register.
+ */
+struct pll_tab {
+ uint8_t clk_div; /* Feed forward */
+ uint8_t min_mul; /* Lower feedback */
+ uint8_t max_mul; /* Upper feedback */
+ uint16_t min_clk; /* Lower frequency */
+ uint16_t max_clk; /* Upper frequency */
+ uint8_t step_size; /* Granularity */
+} cpu_tab[] = { /* CPU PLL */
+ { 1, 20, 40, 2000, 4000, 100},
+ { 2, 20, 39, 1000, 1950, 50},
+ { 4, 20, 39, 500, 975, 25},
+}, gddr_tab[] = { /* GDDR PLL */
+ {1, 14, 30, 1400, 3000, 100},
+ {2, 12, 27, 600, 1350, 50},
+};
+
+#define B_CLK 100 /* Base clock (MHz) */
+
+/*
+ * Convert a PLL ratio register value to a frequency in MHz.
+ * The feed-forward index is the complemented top two bits, the
+ * feedback multiplier the low six bits. Returns 0 when the
+ * encoding falls outside the given PLL table.
+ */
+static uint16_t
+ratio2freq(uint8_t ratio, struct pll_tab * tab, int tablen)
+{
+	uint16_t ff, fb;
+
+	ff = GET_BITS(7, 6, ~ratio);
+	fb = GET_BITS(5, 0, ratio);
+
+	if (ff >= tablen || fb < tab[ff].min_mul || fb > tab[ff].max_mul)
+		return 0;
+
+	return (B_CLK * fb) / tab[ff].clk_div;
+}
+
+/*
+ * Convert a frequency in MHz to a PLL ratio register value.
+ * Tables are searched from the largest divider (finest
+ * granularity) upwards; 0 is returned when the frequency is
+ * out of range or not an exact step of the matching row.
+ */
+static uint8_t
+freq2ratio(uint16_t freq, struct pll_tab * tab, int tablen)
+{
+ int fwd;
+
+ for(fwd = tablen - 1; fwd >= 0; fwd--) {
+ if (freq >= tab[fwd].min_clk && freq <= tab[fwd].max_clk) {
+ /*
+ * Why bother check for accurate input?
+ * Ignoring just rounds down to nearest supported!
+ */
+ if (freq % tab[fwd].step_size)
+ break;
+
+ /* Feed-forward bits are stored complemented */
+ return PUT_BITS(7, 6, ~fwd) |
+ PUT_BITS(5, 0, (freq * tab[fwd].clk_div) / B_CLK);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * GDDR PLL ratio to frequency, scaled from MHz to kHz.
+ */
+static uint32_t
+mr_mt_gf_r2f(uint8_t pll)
+{
+	uint16_t mhz = ratio2freq(pll, gddr_tab, ARRAY_SIZE(gddr_tab));
+
+	return 1000 * mhz;
+}
+
+/*
+ * CPU PLL ratio to frequency, scaled from MHz to kHz.
+ */
+static uint32_t
+mr_mt_cf_r2f(uint8_t pll)
+{
+	uint16_t mhz = ratio2freq(pll, cpu_tab, ARRAY_SIZE(cpu_tab));
+
+	return 1000 * mhz;
+}
+
+
+/*
+ * Board voltage sense converter
+ * Two 10 bit read-outs from SBOX register 0x1038.
+ * The format is very poorly documented, so no
+ * warranty on this conversion. Assumption is
+ * the reading is a binary fixed point number.
+ * bit 15 Valid reading if set
+ * bit 9:8 2 bit integer part
+ * bit 7:0 8 bit fraction part
+ * Return value is 0 (invalid) or voltage i uV.
+ */
+
+/*
+ * Convert a board voltage sense reading into uV.
+ * Bit 15 flags a valid reading; bits 9:8 are the integer part
+ * and bits 7:0 a binary fraction. Returns 0 when invalid.
+ */
+uint32_t
+bvs2volt(uint16_t sense)
+{
+	uint32_t uv, frac, bit;
+
+	if (!GET_BIT(15, sense))
+		return 0;
+
+	/*
+	 * Integer contribution first, then walk the fraction
+	 * bits, halving the contribution at each step.
+	 */
+	uv = 1000000 * GET_BITS(9, 8, sense);
+	frac = 1000000 / 2;
+	for(bit = 1 << 7; bit && frac; bit >>= 1, frac >>= 1) {
+		if (sense & bit)
+			uv += frac;
+	}
+
+	return uv;
+}
+
+
+
+/*
+**
+** Initializations
+**
+** This has two intended purposes:
+** - Do a on-time effort to collect info on properties that
+** are not going to change after the initial setup by
+** either bootstrap or kernel initialization.
+** - Collect initial values on things we can modify.
+** Intent is that unloading the ras module should reset
+** all state to that of the time the module was loaded.
+**
+*/
+
+/*
+ * Build the table of supported core frequencies (in kHz) from
+ * the CPU PLL table, lowest frequencies first, capped at
+ * MR_PTAB_LEN entries.
+ */
+static void __init
+mr_mk_cf_lst(void)
+{
+	int idx, cnt;
+	uint16_t mhz;
+
+	cnt = 0;
+	for(idx = ARRAY_SIZE(cpu_tab) - 1; idx >= 0; idx--) {
+		for(mhz = cpu_tab[idx].min_clk;
+		    mhz <= cpu_tab[idx].max_clk;
+		    mhz += cpu_tab[idx].step_size) {
+			freq.supt[cnt] = 1000 * mhz;
+			freq.slen = ++cnt;
+			if (cnt >= MR_PTAB_LEN)
+				return;
+		}
+	}
+}
+
+/*
+ * Build the table of supported GDDR frequencies (in kHz) from
+ * the GDDR PLL table, lowest frequencies first, capped at
+ * MR_PTAB_LEN entries.
+ */
+static void __init
+mr_mk_gf_lst(void)
+{
+	int i, n;
+	uint16_t f;
+
+	n = 0;
+	for(i = ARRAY_SIZE(gddr_tab) -1; i >= 0; i--) {
+		for(f = gddr_tab[i].min_clk;
+		    f <= gddr_tab[i].max_clk;
+		    f += gddr_tab[i].step_size) {
+			gfreq.supt[n] = 1000 * f;
+			gfreq.slen = ++n;
+			/*
+			 * Guard with '>=' like mr_mk_cf_lst() and
+			 * mr_mk_cv_lst() (was '==', which cannot
+			 * recover if the bound is ever overshot).
+			 */
+			if (n >= MR_PTAB_LEN)
+				return;
+		}
+	}
+}
+
+/*
+ * Build the table of core voltages (in uV) the VRM can be
+ * programmed to, capped at MR_PTAB_LEN entries.
+ */
+static void __init
+mr_mk_cv_lst(void)
+{
+	int cnt;
+	uint32_t uv;
+
+	cnt = 0;
+	for(uv = VRM11_MIN; uv <= VRM11_MAX; uv += VRM11_RES) {
+		volt.supt[cnt] = uv;
+		volt.slen = ++cnt;
+		if (cnt >= MR_PTAB_LEN)
+			return;
+	}
+}
+
+
+void __init
+mr_mt_card_init(void)
+{
+ uint8_t * boot, * stage2, * parm;
+ uint32_t scr7, scr9, fsc;
+ uint32_t cv, cf, gv;
+ int i, j;
+
+ /*
+ * VERS:
+ * Map flash and scan for version strings.
+ * Different methods for KnF and KnC.
+ */
+ boot = ioremap(MIC_SPI_BOOTLOADER_BASE, MIC_SPI_BOOTLOADER_SIZE);
+ stage2 = ioremap(MIC_SPI_2ND_STAGE_BASE, MIC_SPI_2ND_STAGE_SIZE);
+ parm = ioremap(MIC_SPI_PARAMETER_BASE, MIC_SPI_PARAMETER_SIZE);
+ if (!boot || !stage2 || !parm) {
+ printk("mr_mt_init: ioremap failure: boot %p, stage2 %p, par %p\n",
+ boot, stage2, parm);
+ goto fail_iomap;
+ }
+
+ /*
+ * Build numbers for fboot0 and fboot 1 repectively
+ */
+ scr7 = mr_sbox_rl(0, SBOX_SCRATCH7);
+
+ /*
+ * Boot block scan:
+ * Scan for string 'fboot0 version:' or use a 16 bit offset af offset 0xfff8.
+ * The latter points directly to the numeral, not to the string mentioned.
+ */
+ for(i = 0; i < MIC_SPI_BOOTLOADER_SIZE - 32; i++) {
+ if (boot[i] != 'f')
+ continue;
+
+ if (! memcmp(boot + i, "fboot0 version:", 15)) {
+ vers.fboot0[0] = scnprintf(vers.fboot0 + 1, MR_VERS_LEN -2,
+ "%s (build %d)", boot + i, GET_BITS(15, 0, scr7));
+ break;
+ }
+ }
+
+ /*
+ * Stage 2 scan:
+ * Scan for the magic string that locates the bootstrap version. This
+ * area is formatted as '<txt> (<\0>, <vers>)', so the string we are
+ * looking for is 23 bytes later.
+ */
+ for(i = 0; i < MIC_SPI_2ND_STAGE_SIZE - 32; i++) {
+ if (stage2[i] != 'L')
+ continue;
+
+ if (! memcmp(stage2 + i, "Larrabee bootstrap", 18)) {
+ vers.fboot1[0] = scnprintf(vers.fboot1 + 1, MR_VERS_LEN -2,
+ "fboot1 version: %s", stage2 + i + 23);
+ vers.fboot1[0] = scnprintf(vers.fboot1 + vers.fboot1[0], MR_VERS_LEN -2,
+ " (build %d)", GET_BITS(31, 16, scr7));
+ break;
+ }
+ }
+
+ /*
+ * Parameter block scan:
+ * On 4 byte aligned locations, look for chars 'EOB_'.
+ * Numerical values for that string is 0x5f424f45.
+ */
+ for(i = j = 0; i < MIC_SPI_PARAMETER_SIZE; i += sizeof(uint32_t))
+ if (*(uint32_t *)(parm + i) == 0x5f424f45) {
+ vers.flash[j][0] = scnprintf(vers.flash[j] + 1, MR_VERS_LEN -2,
+ "flash %c%c%c%c version: %s",
+ parm[i+4], parm[i+5], parm[i+6], parm[i+7], parm + i + 32);
+ if (++j >= ARRAY_SIZE(vers.flash))
+ break;
+ }
+
+fail_iomap:
+ if (boot)
+ iounmap(boot);
+ if (stage2)
+ iounmap(stage2);
+ if (parm)
+ iounmap(parm);
+
+#if USE_FSC
+ /*
+ * Reset SMC registers to default (MGBR cmd 0, data 0).
+ */
+ mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN, 0);
+
+ /*
+ * The MGBR Status has this layout for (MGBR command 0).
+ * 7:0 Firmware version
+ * 10:8 Card straps
+ * 11 Fan disable
+ * 20:12 Temperatur sensor 5
+ * 27:21 Reserved
+ * 31:28 Command (0)
+ */
+#else
+ /*
+ * Contrary to register spec, the fan speed controller
+ * 2 status register has been redefined to hold version
+ * information of the FSC firmware.
+ * 7:0 Revision
+ * 10:8 FSC straps
+ * 11 Fan disable
+ * 19:12 Temperatur sensor 5
+ * 27:20 Reserved
+ * 28 BIOS clear
+ * 31:29 Reserved
+ * This is probably an early version of the MGBR hack.
+ */
+#endif
+
+ /*
+ * Retrieve FSC version and strap config
+ */
+ fsc = mr_sbox_rl(0, SBOX_STATUS_FAN2);
+ vers.fsc[0] = scnprintf(vers.fsc + 1, MR_VERS_LEN -2,
+ "FSC firmware revision: %02x, straps %x",
+ GET_BITS(7, 0, fsc), GET_BITS(10, 8, fsc));
+
+ /*
+ * VOLT:
+ * Report all voltages the hardware can set.
+ */
+ cv = mr_sbox_rl(0, SBOX_COREVOLT);
+ volt.set = vid2volt(GET_BITS(7, 0, cv));
+ mr_mk_cv_lst();
+
+ /*
+ * FREQ:
+ * In FreeBSD uOS the reference (nominal) frequency
+ * is simply the value read from the SBOX at boot time.
+ * We'll do the same and set 'def' to the same as 'current'.
+ * Report all voltages the hardware can set.
+ */
+ cf = mr_sbox_rl(0, SBOX_COREFREQ);
+ freq.def = mr_mt_cf_r2f(GET_BITS(7, 0, cf));
+ mr_mk_cf_lst();
+
+ /*
+ * GDDR:
+ * See layout of scratch #9 in 'common'.
+ * 23:16 Clock ratio encoding
+ * 28:24 External clock frequency
+ */
+ scr9 = mr_sbox_rl(0, SBOX_SCRATCH9);
+ gddr.speed = 2 * mr_mt_gf_r2f(GET_BITS(23, 16, scr9));
+
+ /*
+ * GVOLT:
+ * Report all voltages the hardware can set.
+ * Kind of silly as these cannot be changed from uOS.
+ * Cheat and set 'def' to the same as 'current'.
+ */
+ gv = mr_sbox_rl(0, SBOX_MEMVOLT);
+ gvolt.set = vid2volt(GET_BITS(7, 0, gv));
+
+ /*
+ * GFREQ:
+ * Report all values the hardware can set.
+ * Kind of silly as these cannot be changed from uOS.
+ * Cheat and set 'ref' to the same as 'current'.
+ */
+ gfreq.def = mr_mt_gf_r2f(GET_BITS(23, 16, scr9));
+ mr_mk_gf_lst();
+
+ /*
+ * POWER:
 * In case the FSC is not working, or support is not compiled in,
+ * preset all power readings as invalid.
+ */
+ {
+ struct mr_rsp_power tmp = {{0, 3}, {0, 3}, {0, 3},
+ {0, 3}, {0, 3}, {0, 3}, {0, 3},
+ {0, 0, 0, 3, 3, 3},
+ {0, 0, 0, 3, 3, 3},
+ {0, 0, 0, 3, 3, 3}};
+ power = tmp;
+ }
+
+ /*
+ *TBD: Save card registers this module may change
+ */
+}
+
void __exit
mr_mt_card_exit(void)
{
	/*
	 * Module unload hook for card-specific state.
	 * Currently a no-op: nothing is restored because nothing
	 * is saved yet (see matching TBD in the init function).
	 *
	 *TBD: Restore card registers this module may change
	 */
}
+
+
+
+/*
+**
+** Card specific 'Get' functions
+**
+*/
+
+int
+mr_get_volt(void * p)
+{
+ struct mr_rsp_volt * r;
+ uint32_t cv, fsc;
+
+
+ cv = mr_sbox_rl(0, SBOX_COREVOLT);
+ volt.set = vid2volt(GET_BITS(7, 0, cv));
+
+ fsc = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE);
+ volt.cur = bvs2volt(GET_BITS(15, 0, fsc));
+
+ r = (struct mr_rsp_volt *) p;
+ *r = volt;
+ return sizeof(*r);
+}
+
+int
+mr_get_freq(void * p)
+{
+ struct mr_rsp_freq * r;
+ uint32_t cf;
+
+ cf = mr_sbox_rl(0, SBOX_COREFREQ);
+ freq.cur = mr_mt_cf_r2f(GET_BITS(7, 0, cf));
+
+ r = (struct mr_rsp_freq *) p;
+ *r = freq;
+ return sizeof(*r);
+}
+
#if USE_FSC
/*
 * Fetch one power telemetry reading from the fan speed controller.
 * Issues the general MGBR command 'req' and reads the status back;
 * on failure the reading is flagged invalid (p_val = 3), otherwise
 * bits 15:0 of the status are scaled to micro-units and stored.
 */
static void
get_fsc_pwr(uint32_t req, struct mr_rsp_pws * pws)
{
	uint32_t status;

	fsc_mgbr_write(MR_FSC_MGBR_GEN_CMD, &req);
	if (fsc_mgbsr_read(&status)) {
		/* No response from FSC: mark reading invalid */
		pws->p_val = 3;
	}
	else {
		pws->p_val = 0;
		pws->prr = 1000000 * GET_BITS(15, 0, status);
	}
}
#endif
+
/*
 * Return the card power record.
 * With FSC support compiled in, all power rails are queried from
 * the fan speed controller first; without it the record preset at
 * init time (all readings invalid) is returned unchanged.
 */
int
mr_get_power(void * p)
{
  struct mr_rsp_power * r;

#if USE_FSC
  uint8_t prev_cmd;
  uint32_t prev_dat;

  /*
   * Backup current OVERRIDE register
   * NOTE(review): mgbr_cmd/mgbr_dat are globals maintained by the
   * FSC MGBR helpers — presumably the last command/data written;
   * confirm against their definitions.
   */
  prev_cmd = mgbr_cmd;
  prev_dat = mgbr_dat;

  /*
   * Get Power stats from the FSC
   * (one MGBR round-trip per rail; order follows the record layout)
   */
  get_fsc_pwr(MR_FSC_PWR_TOT, &power.tot0);
  get_fsc_pwr(MR_FSC_PWR1_TOT, &power.inst);
  get_fsc_pwr(MR_FSC_PWR_PCIE, &power.pcie);
  get_fsc_pwr(MR_FSC_PWR_2X3, &power.c2x3);
  get_fsc_pwr(MR_FSC_PWR_2X4, &power.c2x4);

  /*
   * Revert to normal or fan 1 speed override mode if needed.
   */
  if (fan1_ovr)
    fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &fan1_ovr);
  else
    fsc_mgbr_write(prev_cmd, &prev_dat);
#endif

  r = (struct mr_rsp_power *) p;
  *r = power;
  return sizeof(*r);
}
+
+
+int
+mr_get_plim(void * p)
+{
+ struct mr_rsp_plim * r;
+
+#if USE_FSC
+ uint32_t fsc, req, ofs;
+
+ /*
+ * Read the FSC status
+ */
+ req = MR_FSC_PMC_CFG;
+ fsc_mgbr_write(MR_FSC_MGBR_GEN_CMD, &req);
+ if (! fsc_mgbsr_read(&fsc)) {
+ ofs = 5 * GET_BITS(3, 0, fsc);
+ if (GET_BIT(4, fsc))
+ plim.phys = 300 - ofs;
+ else
+ plim.phys = 300 + ofs;
+ plim.hmrk = plim.lmrk = plim.phys;
+ }
+#endif
+
+ r = (struct mr_rsp_plim *) p;
+ *r = plim;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_gfreq(void * p)
+{
+ struct mr_rsp_gfreq * r;
+ uint32_t gbr;
+
+ gbr = mr_sbox_rl(0, SBOX_MEMORYFREQ);
+ gfreq.cur = mr_mt_gf_r2f(GET_BITS(7, 0, gbr));
+
+ r = (struct mr_rsp_gfreq *) p;
+ *r = gfreq;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_gvolt(void * p)
+{
+ struct mr_rsp_gvolt * r;
+ uint32_t gv, fsc;
+
+ gv = mr_sbox_rl(0, SBOX_MEMVOLT);
+ gvolt.set = vid2volt(GET_BITS(7, 0, gv));
+
+ fsc = mr_sbox_rl(0, SBOX_BOARD_VOLTAGE_SENSE);
+ gvolt.cur = bvs2volt(GET_BITS(31, 16, fsc));
+
+ r = (struct mr_rsp_gvolt *) p;
+ *r = gvolt;
+ return sizeof(*r);
+}
+
+int
+mr_get_temp(void * p)
+{
+ struct mr_rsp_temp * r;
+ uint32_t btr1, btr2; /* Board temps */
+ uint32_t die1, die2, die3; /* Die temps */
+ uint32_t dmx1, dmx2, dmx3; /* Max die temps */
+ uint32_t tsta, fsc; /* Thermal status */
+
+ btr1 = mr_sbox_rl(0, SBOX_BOARD_TEMP1);
+ btr2 = mr_sbox_rl(0, SBOX_BOARD_TEMP2);
+ die1 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP0);
+ die2 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP1);
+ die3 = mr_sbox_rl(0, SBOX_CURRENT_DIE_TEMP2);
+ dmx1 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP0);
+ dmx2 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP1);
+ dmx3 = mr_sbox_rl(0, SBOX_MAX_DIE_TEMP2);
+ tsta = mr_sbox_rl(0, SBOX_THERMAL_STATUS);
+ fsc = mr_sbox_rl(0, SBOX_STATUS_FAN2);
+
+ /*
+ * Board temperatures.
+ * No idea of where on the board they are located, but
+ * guessing from FreeBSD comments they are:
+ * 0 Air Inlet
+ * 1 VCCP VR
+ * 2 GDDR (not sure which chip)
+ * 3 GDDR VR
+ * The temperature read from FSC #2 seems valid, but
+ * there's no mention of where it's measured.
+ * The readings does not make much sense.
+ * Sample readings are like this:
+ * fin 32
+ * vccp 28 (vccp VR)
+ * vddq 33 (gddr VR)
+ * vddg 28 (FSC 2)
+ * So, at least 'fin' is wrong (or fan in reverse).
+ */
+ temp.fin.cur = (btr1 & (1 << 15)) ? GET_BITS( 8, 0, btr1) : 0;
+ temp.vccp.cur = (btr1 & (1 << 31)) ? GET_BITS(24, 16, btr1) : 0;
+ temp.gddr.cur = (btr2 & (1 << 15)) ? GET_BITS( 8, 0, btr2) : 0;
+ temp.vddq.cur = (btr2 & (1 << 31)) ? GET_BITS(24, 16, btr2) : 0;
+ temp.vddg.cur = GET_BITS(19, 12, fsc);
+ temp.brd.cur = 0;
+ if (temp.fin.cur > temp.brd.cur)
+ temp.brd.cur = temp.fin.cur;
+ if (temp.vccp.cur > temp.brd.cur)
+ temp.brd.cur = temp.vccp.cur;
+ if (temp.gddr.cur > temp.brd.cur)
+ temp.brd.cur = temp.gddr.cur;
+ if (temp.vddq.cur > temp.brd.cur)
+ temp.brd.cur = temp.vddq.cur;
+ temp.fout.c_val = 3;
+ temp.gddr.c_val = 3;
+
+ /*
+ * Die temperatures.
+ */
+ temp.die.cur = (tsta & (1 << 31)) ? GET_BITS(30, 22, tsta) : 0;
+ temp.dies[0].cur = GET_BITS( 8, 0, die1);
+ temp.dies[1].cur = GET_BITS(17, 9, die1);
+ temp.dies[2].cur = GET_BITS(26, 18, die1);
+ temp.dies[3].cur = GET_BITS( 8, 0, die2);
+ temp.dies[4].cur = GET_BITS(17, 9, die2);
+ temp.dies[5].cur = GET_BITS(26, 18, die2);
+ temp.dies[6].cur = GET_BITS( 8, 0, die3);
+ temp.dies[7].cur = GET_BITS(17, 9, die3);
+ temp.dies[8].cur = GET_BITS(26, 18, die3);
+
+ /*
+ * Die max temp (min is not reported to CP).
+ */
+ temp.dies[0].max = GET_BITS( 8, 0, dmx1);
+ temp.dies[1].max = GET_BITS(17, 9, dmx1);
+ temp.dies[2].max = GET_BITS(26, 18, dmx1);
+ temp.dies[3].max = GET_BITS( 8, 0, dmx2);
+ temp.dies[4].max = GET_BITS(17, 9, dmx2);
+ temp.dies[5].max = GET_BITS(26, 18, dmx2);
+ temp.dies[6].max = GET_BITS( 8, 0, dmx3);
+ temp.dies[7].max = GET_BITS(17, 9, dmx3);
+ temp.dies[8].max = GET_BITS(26, 18, dmx3);
+
+ r = (struct mr_rsp_temp *) p;
+ *r = temp;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_fan(void * p)
+{
+ struct mr_rsp_fan * r;
+ uint32_t fan1, fovr;
+
+ r = (struct mr_rsp_fan *) p;
+ fan1 = mr_sbox_rl(0, SBOX_STATUS_FAN1);
+
+#if USE_FSC
+ fovr = fan1_ovr;
+ r->override = GET_BIT(9, fovr);
+#else
+ fovr = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN);
+ r->override = GET_BIT(15, fovr);
+#endif
+
+ r->rpm = GET_BITS(15, 0, fan1);
+ if (r->override)
+ r->pwm = GET_BITS( 7, 0, fovr);
+ else
+ r->pwm = GET_BITS(23, 16, fan1);
+
+ return sizeof(*r);
+}
+
+
+int
+mr_get_ecc(void * p)
+{
+ struct mr_rsp_ecc * r;
+
+ r = (struct mr_rsp_ecc *) p;
+ *r = ecc;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_trbo(void * p)
+{
+ struct mr_rsp_trbo * r;
+
+ r = (struct mr_rsp_trbo *) p;
+ *r = trbo;
+ return sizeof(*r);
+}
+
+
+int
+mr_get_pmcfg(void * p)
+{
+ struct mr_rsp_pmcfg * r;
+
+ r = (struct mr_rsp_pmcfg *) p;
+ *r = pmcfg;
+ return sizeof(*r);
+}
+
+
+/*
+**
+** Card specific 'Set' functions
+** Input screening takes place here (to the extent possible).
+**
+*/
+
+
+int
+mr_set_volt(void * p)
+{
+ uint32_t cv, msk, new, val;
+ uint8_t vid;
+ int i;
+
+ /*
+ * Ensure it's a supported value
+ */
+ val = *(uint32_t *) p;
+ for(i = 0; i < MR_PTAB_LEN; i++)
+ if (volt.supt[i] == val)
+ break;
+ if (i == MR_PTAB_LEN)
+ return -MR_ERR_RANGE;
+
+ /*
+ * Read-modify-write the core voltage VID register
+ */
+ vid = volt2vid(val);
+ cv = mr_sbox_rl(0, SBOX_COREVOLT);
+ msk = ~PUT_BITS(7, 0, ~0);
+ new = (cv & msk) | PUT_BITS(7, 0, vid);
+ mr_sbox_wl(0, SBOX_COREVOLT, new);
+ printk("SetVolt: %d -> %08x (%08x)\n", val, new, cv);
+
+ return 0;
+}
+
+
+int
+mr_set_freq(void * p)
+{
+ uint32_t cf, msk, new, val;
+ uint8_t rat;
+ int i;
+
+ /*
+ * Ensure it's a supported value
+ */
+ val = *(uint32_t *) p;
+ for(i = 0; i < MR_PTAB_LEN; i++)
+ if (freq.supt[i] == val)
+ break;
+ if (i == MR_PTAB_LEN)
+ return -MR_ERR_RANGE;
+
+ /*
+ * Read-modify-write the core frequency PLL register
+ *
+ *TBD: or should we just overwrite it?
+ * Register fields (of relevance):
+ * 7:0 New PLL encoding
+ * 16 Async Operation
+ * 31 Override fuse setting
+ */
+ rat = freq2ratio(val/1000, cpu_tab, ARRAY_SIZE(cpu_tab));
+ cf = mr_sbox_rl(0, SBOX_COREFREQ);
+ msk = ~(PUT_BITS(7, 0, ~0) | PUT_BIT(16, 1) | PUT_BIT(31, 1));
+ new = (cf & msk) | PUT_BITS(7, 0, rat) | PUT_BIT(31, 1);
+ mr_sbox_wl(0, SBOX_COREFREQ, new);
+ printk("SetFreq: %d -> %08x (%08x)\n", val, new, cf);
+
+ /*
+ *TBD:
+ * We just changed the system's base clock without
+ * re-calibrating the APIC timer tick counters.
+ * There is probably a function call for the cpu-freq
+ * driver to deal with this, so should we call it?
+ */
+
+ return 0;
+}
+
+
+int
+mr_set_plim(void * p)
+{
+ plim.phys = *(uint32_t *) p;
+ return 0;
+}
+
+
/*
 * Set fan #1 speed override.
 * Input is a struct mr_set_fan with an override flag (0 or 1) and
 * a PWM value. With FSC support the override is sent via the MGBR
 * command interface; otherwise the SBOX fan override register is
 * read-modify-written, leaving the fan #2 half untouched.
 */
int
mr_set_fan(void * p)
{
	struct mr_set_fan * fc;

	/*
	 * Ensure operation is valid, i.e. no garbage
	 * in override flag (only 1 and 0 allowed) and
	 * that pwm is not zero (or above lower limit?)
	 */
	fc = (struct mr_set_fan *) p;
	if (GET_BITS(7, 1, fc->override) || !fc->pwm)
		return -MR_ERR_RANGE;

#if USE_FSC
	{
		uint32_t dat;

		/*
		 * Craft the default OVERRIDE command and write it to FSC
		 * through the MGBR register (command 0).
		 * This does not change the telemetry in MGBSR, so only way
		 * to ensure it gets registered by FSC is to wait it out
		 * (happens in fsc_mgbr_write function).
		 */
		if (fc->override)
			dat = PUT_BIT(9, 1) | fc->pwm;
		else
			dat = 0;
		fsc_mgbr_write(MR_FSC_MGBR_OVR_CMD, &dat);
	}
#else
	/*
	 * Read-modify-write the fan override register
	 * Control of fan #1 only, don't touch #2
	 */
	{
		uint32_t fcor, fco1, fco2;

		fcor = mr_sbox_rl(0, SBOX_SPEED_OVERRIDE_FAN);
		fco2 = GET_BITS(31, 16, fcor);
		if (fc->override)
			fco1 = PUT_BIT(15, 1) | fc->pwm;
		else
			fco1 = 0;
		mr_sbox_wl(0, SBOX_SPEED_OVERRIDE_FAN,
				PUT_BITS(31, 16, fco2) | PUT_BITS(15, 0, fco1));
	}
#endif

	return 0;
}
+
+
int
mr_set_trbo(void * p)
{
	/*
	 * Turbo control is not implemented on this platform;
	 * the request is accepted and silently ignored.
	 */
	return 0;
}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS module driver
+ *
+ * Contains code to handle module install/deinstall
+ * and handling proper registration(s) to SCIF, sysfs
+ * pseudo file system, timer ticks, I2C driver and
+ * other one-time tasks.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/io.h>
+#include <linux/cred.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/apic.h>
+#include <asm/mic/mic_common.h>
+#include <asm/mic/mic_knc/autobaseaddress.h>
+#include <asm/mic/mic_knc/micsboxdefine.h>
+#include <scif.h>
+#include "micras.h"
+
+#if MT_VERBOSE || MC_VERBOSE || PM_VERBOSE
+/*
 * For making scif_epd_t non-opaque
+ */
+#define _MIC_MICBASEDEFINE_REGISTERS_H_ 1
+#include <mic/micscif.h>
+#endif
+
+/*
+** Lookup table to map API opcode into MT function.
+**
+** As we have to deal with both KnF and KnC, functions to
+** retrieve information may be generic, in micras_common.c,
+** or platform specific, in micras_kn{cf}.c.
+** Code location is transparent to this table.
+**
+** Some MT functions can safely be called without
+** serialization, e.g. if they are read-only or use
+** atomics to get/set variables. The 'simple' flag tells
+** which functions are safe to call without serialization.
+** Other functions should be called thru micras_mt_call().
+**
+** See micras_api.h and micpm_api.h for function details.
+*/
+
/*
 * NOTE(review): this table is indexed directly by opcode
 * (micras_mt_call does fnc_map[cmd]), so slot positions must
 * match the MR_*/PM_* opcode numbering exactly — placeholder
 * rows of zeros keep the indexing aligned under all configs.
 * Do not reorder or remove entries.
 */
static struct fnc_tab fnc_map[] = {
  { 0, 0, 0, 0 },
  { MR_REQ_HWINF, 1, 0, mr_get_hwinf },
  { MR_REQ_VERS, 1, 0, mr_get_vers },
  { MR_REQ_CFREQ, 0, 0, mr_get_freq },
  { MR_SET_CFREQ, 0, 1, mr_set_freq },
  { MR_REQ_CVOLT, 0, 0, mr_get_volt },
  { MR_SET_CVOLT, 0, 1, mr_set_volt },
  { MR_REQ_PWR, 0, 0, mr_get_power },
  { MR_REQ_PLIM, 0, 0, mr_get_plim },
  { MR_SET_PLIM, 0, 1, mr_set_plim },
  { MR_REQ_CLST, 0, 0, mr_get_clst },
  { MR_ENB_CORE, 0, 1, 0 },
  { MR_DIS_CORE, 0, 1, 0 },
  { MR_REQ_GDDR, 1, 0, mr_get_gddr },
  { MR_REQ_GFREQ, 1, 0, mr_get_gfreq },
  { MR_SET_GFREQ, 1, 1, 0 },
  { MR_REQ_GVOLT, 1, 0, mr_get_gvolt },
  { MR_SET_GVOLT, 1, 1, 0 },
  { MR_REQ_TEMP, 0, 0, mr_get_temp },
  { MR_REQ_FAN, 0, 0, mr_get_fan },
  { MR_SET_FAN, 0, 1, mr_set_fan },
  { MR_REQ_ECC, 1, 0, mr_get_ecc },
  { MR_SET_ECC, 0, 1, 0 },
  { MR_REQ_TRC, 1, 0, mr_get_trc },
  { MR_SET_TRC, 1, 1, mr_set_trc },
  { MR_REQ_TRBO, 0, 0, mr_get_trbo },
  { MR_SET_TRBO, 0, 1, mr_set_trbo },
  { MR_REQ_OCLK, 0, 0, 0 },
  { MR_SET_OCLK, 0, 1, 0 },
  { MR_REQ_CUTL, 0, 0, mr_get_cutl },
  { MR_REQ_MEM, 0, 0, mr_get_mem },
  { MR_REQ_OS, 0, 0, mr_get_os },
  { MR_REQ_PROC, 0, 0, mr_get_proc },
  { MR_REQ_THRD, 0, 0, 0 },
  { MR_REQ_PVER, 1, 0, mr_get_pver },
  { MR_CMD_PKILL, 0, 1, mr_cmd_pkill },
  { MR_CMD_UKILL, 0, 1, mr_cmd_ukill },
#if defined(CONFIG_MK1OM)
  { MR_GET_SMC, 0, 0, mr_get_smc },
  { MR_SET_SMC, 0, 0, mr_set_smc },
#else
#if defined(CONFIG_ML1OM) && USE_FSC
  { MR_GET_SMC, 0, 0, mr_get_fsc },
  { MR_SET_SMC, 0, 1, mr_set_fsc },
#else
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
#endif
#endif
  { MR_REQ_PMCFG, 0, 0, mr_get_pmcfg },
#if defined(CONFIG_MK1OM)
  { MR_REQ_LED, 0, 0, mr_get_led },
  { MR_SET_LED, 0, 1, mr_set_led },
  { MR_REQ_PROCHOT, 0, 0, mr_get_prochot },
  { MR_SET_PROCHOT, 0, 1, mr_set_prochot },
  { MR_REQ_PWRALT, 0, 0, mr_get_pwralt },
  { MR_SET_PWRALT, 0, 1, mr_set_pwralt },
  { MR_REQ_PERST, 0, 0, mr_get_perst },
  { MR_SET_PERST, 0, 1, mr_set_perst },
  { MR_REQ_TTL, 0, 0, mr_get_ttl },
#else
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
  { 0, 0, 0, 0 },
#endif
#if defined(CONFIG_MK1OM) && USE_PM
  { 0, 0, 0, 0 },
  { PM_REQ_PL0, 1, 0, pm_get_pl0 },
  { PM_SET_PL0, 1, 1, pm_set_pl0 },
  { PM_REQ_PL1, 1, 0, pm_get_pl1 },
  { PM_SET_PL1, 1, 1, pm_set_pl1 },
  { PM_REQ_PAVG, 1, 0, pm_get_pavg },
  { PM_REQ_PTTL, 1, 0, pm_get_pttl },
  { PM_REQ_VOLT, 1, 0, pm_get_volt },
  { PM_REQ_TEMP, 1, 0, pm_get_temp },
  { PM_REQ_TACH, 1, 0, pm_get_tach },
  { PM_REQ_TTTL, 1, 0, pm_get_tttl },
  { PM_REQ_FTTL, 1, 0, pm_get_fttl },
  { PM_SET_FTTL, 1, 1, pm_set_fttl },
#endif
};
+
+
+
+/*
+**
+** The monitoring thread.
+** In fact this is a work_queue, that receive work items
+** from several independent parties, such as SCIF, sysfs,
+** out of band telemetry, PM and possibly timers.
+**
+** These parties pass a structure with information necessary
+** for the call-out function called by the MT thread to operate.
+** These structures must include the work item structure, such
+** that the container_of() mechanism can be used to locate it.
+**
+** The MT thread does not by itself provide any feed-back on
+** when a task was executed nor the results from it. Therefore
+** if a feedback is requred, then the callout needs to provide
+** their own methods, such as the wait queue used by function
+** micras_mt_data() below. Experiments has shown that it is not
+** safe to place work item or the wait queue on a stack (no
+** idea why, could be a bug).
+**
+*/
+
static int micras_stop;		/* Module shutdown */
static struct delayed_work micras_wq_init;	/* Setup work item */
static struct delayed_work micras_wq_tick;	/* Timer tick token */
static struct workqueue_struct * micras_wq;	/* Monitor thread */
       int micras_priv;		/* Call-out privileged */


/*
 * One unit of work for the MT thread: the call-out, its argument,
 * its result, and the wait queue the submitter blocks on until
 * micras_mt_data() has run the call-out and filled in 'rtn'.
 */
typedef struct wq_task {
  int		req;			/* Request opcode */
  int		rtn;			/* Return value */
  int		priv;			/* Privileged */
  void	      * ptr;			/* Response buffer */
  int  (* fnc)(void *);			/* Call out */
  struct work_struct	wrk;		/* Work item */
  wait_queue_head_t	wqh;		/* Wait queue header */
} WqTask;
+
+
+#if defined(CONFIG_MK1OM) && WA_4845465
+/*
+ * SMC die temp update job.
+ *
+ * As per HSD #4845465 we push the die temperature
+ * to the SMC instead of the usual reverse direction.
+ * This has to happen at around 50 mSec intervals, which should
+ * be possible with a work queue implementation. If that turns out
+ * not to be reliable enough we may need a more direct approach.
+ * During the experiment, we want to override the pushed temp.
+ */
+
+#define DIE_PROC 1 /* Enable die temp override */
+#define SMC_PERIOD 50 /* SMC update interval, mSec */
+#define JITTER_STATS 1 /* Enable jitter measurements */
+
+static struct delayed_work micras_wq_smc; /* SMC update token */
+static int smc_4845465; /* SMC push capable */
+#if DIE_PROC
+static int die_override; /* Temperature override */
+#endif
+
/*
 * Periodic work item pushing the die temperature to the SMC
 * (HSD #4845465 workaround). Re-arms itself every SMC_PERIOD
 * mSec until module shutdown, optionally gathers jitter stats,
 * then composes and writes SMC register 0x50.
 */
static void
micras_mt_smc(struct work_struct *work)
{
  extern int mr_smc_wr(uint8_t, uint32_t *);
  static uint64_t n;
  uint32_t tmp;
  uint32_t ts2, mfs;

  if (! micras_stop) {
    /*
     * Re-arm for a callback in about SMC_PERIOD (50) mSec.
     * There is no guarantee this will be more than approximate.
     */
    queue_delayed_work(micras_wq, &micras_wq_smc, msecs_to_jiffies(SMC_PERIOD));
  }

#if JITTER_STATS
  /*
   * Time the interval in order to get some
   * measurement on what jitter to expect.
   * Leave a log message once every minute.
   */
  {
    static uint64_t d, t1, t2, s, hi, lo = ~0;

    t2 = rdtsc();
    if (n) {
      d = t2 - t1;
      s += d;
      if (d > hi)
        hi = d;
      if (d < lo)
        lo = d;
#if 1
      {
        /*
         * Show jitter in buckets representing 5 mSec.
         * The center (#20) represent +- 2.5 mSec from reference.
         * It is assumed TSC running at 1.1 GHz here; if PM kicks
         * in the measurements may be way off because it manipulates
         * the system clock and indirectly the jiffy counter.
         */
        static uint64_t buckets[41];
        int bkt;
        int64_t err;

        err = ((d * 10) / 11) - (50 * 1000 * 1000);
        if (err < -(25 * 100 * 1000))
          bkt = 19 + (err + (25 * 100 * 1000)) / (5 * 1000 * 1000);
        else
        if (err > (25 * 100 * 1000))
          bkt = 21 + (err - (25 * 100 * 1000)) / (5 * 1000 * 1000);
        else
          bkt = 20;
        if (bkt < 0)
          bkt = 0;
        if (bkt > 40)
          bkt = 40;
        buckets[bkt]++;
        /* Dump the distribution every 10 seconds */
        if ((n % ((10 * 1000)/SMC_PERIOD)) == ((10 * 1000)/SMC_PERIOD) - 1) {
          printk("smc_upd: dist");
          for(bkt = 0; bkt < 41; bkt++) {
            if (bkt == 20)
              printk(" | %lld |", buckets[bkt]);
            else
              printk(" %lld", buckets[bkt]);
          }
          printk("\n");
        }
      }
#endif
      /* Min/max/average summary once per minute */
      if ((n % ((60 * 1000)/SMC_PERIOD)) == ((60 * 1000)/SMC_PERIOD) - 1)
        printk("smc_upd: %lld, min %lld, max %lld, avg %lld\n", n, lo, hi, s / n);
    }
    t1 = t2;
  }
#endif	/* JITTER_STATS */

  /*
   * Send update to SMC to register 0x50.
   * The value to push at the SMC must have following content
   *
   *  Bits 9:0	 Device Temperature
   *		  -> THERMAL_STATUS_2 bits 19:10
   *  Bit 10	 Valid bit
   *		  -> THERMAL_STATUS_2 bit 31
   *  Bits 20:11 Thermal Monitor Control value
   *		  -> THERMAL_STATUS_2 bits 9:0
   *  Bits 30:21 Fan Thermal Control value
   *		  -> MICROCONTROLLER_FAN_STATUS bits 17:8
   */

  n++;
  ts2 = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2);
  mfs = mr_sbox_rl(0, SBOX_MICROCONTROLLER_FAN_STATUS);

#if DIE_PROC
  if (die_override)
    tmp = GET_BITS(9, 0, die_override);
  else
#endif
    tmp = PUT_BITS(9, 0, GET_BITS(19, 10, ts2));
  tmp |= PUT_BIT(10, GET_BIT(31, ts2)) |
	 PUT_BITS(20, 11, GET_BITS(9, 0, ts2)) |
	 PUT_BITS(30, 21, GET_BITS(17, 8, mfs));

  if (mr_smc_wr(0x50, &tmp))
    printk("smc_upd: %lld, tmp %d, SMC write failed\n", n, tmp);
}
+
+
+#if DIE_PROC
+/*
+ * Test proc file to override die temperature push.
+ * A value of 0 means no override, any other value is
+ * pushed as if it was a 'device temperature'.
+ */
+
+static struct proc_dir_entry * die_pe;
+
+/*
+ * On writes: scan input line for single number.
+ */
+
+static ssize_t
+die_write(struct file * file, const char __user * buff, size_t len, loff_t * off)
+{
+ char * buf;
+ char * ep, * cp;
+ unsigned long ull;
+ int err;
+
+ /*
+ * Get input line into kernel space
+ */
+ if (len > PAGE_SIZE -1)
+ len = PAGE_SIZE -1;
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (! buf)
+ return -ENOMEM;
+ if (copy_from_user(buf, buff, len)) {
+ err = -EFAULT;
+ goto wr_out;
+ }
+ buf[len] = '\0';
+ cp = ep = (char *) buf;
+
+ /*
+ * Read a number in strtoul format 0.
+ */
+ while(isspace(*cp))
+ cp++;
+ ull = simple_strtoull(cp, &ep, 0);
+ if (ep == cp || (*ep != '\0' && !isspace(*ep))) {
+ printk("Invalid die temp given\n");
+ err = -EINVAL;
+ goto wr_out;
+ }
+
+ die_override = GET_BITS(9, 0, ull);
+ printk("Die temp override set to %d C\n", die_override);
+
+ /*
+ * Swallow any trailing junk up to next newline
+ */
+ ep = strchr(buf, '\n');
+ if (ep)
+ cp = ep + 1;
+ err = cp - buf;
+
+wr_out:
+ kfree(buf);
+ return err;
+}
+
+
+/*
+ * On reads: return string of current override temp.
+ */
+
+static ssize_t
+die_read(struct file * file, char __user * buff, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ size_t len;
+
+ len = snprintf(buf, sizeof(buf), "%d\n", die_override);
+ return simple_read_from_buffer(buff, count, ppos, buf, len);
+}
+
+
/* File operations for the die temp override proc entry */
static const struct file_operations proc_die_operations = {
	.read		= die_read,
	.write		= die_write,
	.llseek		= no_llseek,
};
+#endif /* DIE_PROC */
+#endif /* WA_4845465 */
+
+
+/*
+ * Timer tick job
+ *
+ * This is for periodic updates from the SMC,
+ * which (with a little luck) can be avoided
+ * at the cost of I2C communications during
+ * actual CP queries.
+ */
+
/*
 * Periodic housekeeping work item.
 * Re-arms itself every MT_PERIOD mSec until shutdown and drains
 * pending elog messages into the kernel log. Compiled out entirely
 * when MT_TIMER is off (body becomes empty).
 */
static void
micras_mt_tick(struct work_struct *work)
{
#if MT_TIMER
  static int n;

  n++;
  if (! micras_stop) {
    /*
     * Re-arm for a callback in about 1 second.
     * There is no guarantee this will be more than approximate.
     */
    queue_delayed_work(micras_wq, &micras_wq_tick, msecs_to_jiffies(MT_PERIOD));
  }

  /*
   * Dump elog prints into the kernel log.
   * Messages sit in the ee_buf ring between ee_seen and ee_msg;
   * each consumed slot is cleared and ee_seen advanced.
   *TBD: debug tool, time-shifts messages, remove eventually.
   */
  {
    int msg_top, msg_id;
    char * buf;

    msg_id = atomic_read(&ee_seen);
    msg_top = atomic_read(&ee_msg);
    while(++msg_id <= msg_top) {
      buf = ee_buf + (msg_id % EE_BUF_COUNT) * EE_BUF_LINELEN;
      if (! *buf)
        break;
      printk("%s", buf);
      *buf = '\0';
      atomic_inc(&ee_seen);
    }
  }
#endif
}
+
+
+/*
+ * Handle SCIF & sysfs show/store requests
+ *
+ * By convention we know that the work item is member of
+ * a larger struct, which can readily be found using the
+ * container_of mechanism.
+ *
+ * Otherwise this routine just calls the function stored
+ * in the larger struct's mt_data element, and on its
+ * return wake up whoever is waiting for it's completion.
+ */
+
+static void
+micras_mt_data(struct work_struct * work)
+{
+ struct wq_task * wq;
+
+ wq = container_of(work, struct wq_task, wrk);
+ micras_priv = wq->priv;
+ wq->rtn = wq->fnc(wq->ptr);
+ micras_priv = 0;
+ wake_up_all(& wq->wqh);
+}
+
+
+/*
+ * Helper to pass jobs (work items) to the monitoring thread.
+ *
+ * As input it receives details on function to be called, one
+ * argument to pass to that function, the opcode associated
+ * with the function and a function return value. The latter
+ * will be set to -MR_ERR_PEND, and we'll expect the callout
+ * function to change it.
+ *
+ * The work item is the only piece of information passed to
+ * the work queue callout, so we'll wrap it into a larger
+ * structure along with the received details such that the
+ * work queue can perform a function call on our behalf.
+ */
+
/*
 * Run one wq_task on the MT work queue and wait for its result.
 * Caller fills in wq->fnc/ptr/req; this routine initializes the
 * work item and wait queue, enqueues, and blocks (interruptible,
 * 1 second timeout) until the call-out has replaced the
 * -MR_ERR_PEND sentinel in wq->rtn.
 */
static int
micras_mt_tsk(struct wq_task * wq)
{
  int err;

#if MT_VERBOSE
  uint64_t start, stop;
  start = rdtsc();
#endif

  /*
   * Create a work item for the RAS thread,
   * enqueue and wait for it's completion.
   *
   *TBD: Timeout length to be revisited
   */
  wq->rtn = -MR_ERR_PEND;
  INIT_WORK_ONSTACK(&wq->wrk, micras_mt_data);
  init_waitqueue_head(&wq->wqh);
  queue_work(micras_wq, &wq->wrk);
  err = wait_event_interruptible_timeout(wq->wqh,
		wq->rtn != -MR_ERR_PEND, msecs_to_jiffies(1000));

  /*
   * Check for potential errors, which for now can only be
   * "interrupted" or "timeout". In both cases try cancel the work
   * item from MT thread. If cancel succeds (returns true) then
   * the work item was still "pending" and is now removed from the
   * work queue, i.e. it is safe to continue (with error).
   * Otherwise, the cancel operation will wait for the work item's
   * call-out function to finish, which kind of defies the purpose
   * of "interruptable". However, we cannot leave until it is certain
   * that it will not be accessed by the RAS thread.
   */
  if (err == -ERESTARTSYS || err == 0) {
    printk("MT tsk: interrupted or failure, err %d\n", err);
    printk("MT tsk: FAILED: cmd %d, rtn %d, fnc %p, ptr %p\n",
		wq->req, wq->rtn, wq->fnc, wq->ptr);

    err = cancel_work_sync(&wq->wrk);
    printk("MT tsk: work canceled (%d)\n", err);
  }

  /*
   * Completed, turn interrupts and timeouts into MR errors.
   * (A still-pending sentinel means the call-out never ran.)
   */
  err = wq->rtn;
  if (err == -MR_ERR_PEND)
    err = -MR_ERR_NOVAL;

#if MT_VERBOSE
  stop = rdtsc();
  printk("MT tsk: cmd %d, err %d, time %llu\n", wq->req, err, stop - start);
#endif
  return err;
}
+
+
+/*
+ * Public interface to the MT functions
+ * Caller responsible for passing a buffer large enough
+ * to hold data for reads or writes (1 page will do,
+ * but structs matching the commands are recommended).
+ * Returned data are structs defined in micras.h
+ */
+
+int
+micras_mt_call(uint16_t cmd, void * buf)
+{
+ struct wq_task * wq;
+ int err;
+
+ if (micras_stop)
+ return -MR_ERR_UNSUP;
+
+ if (cmd > MR_REQ_MAX)
+ return -MR_ERR_INVOP;
+
+ err = -MR_ERR_UNSUP;
+ if (fnc_map[cmd].fnc) {
+ if (fnc_map[cmd].simple) {
+ /*
+ * Fast access, just call function
+ */
+ err = fnc_map[cmd].fnc(buf);
+ }
+ else {
+ /*
+ * Slow access, go through serializer.
+ * We allocate a work queue task for the MT thread,
+ * stuff arguments in it, run task, and then free
+ * work queue task.
+ */
+ wq = kmalloc(sizeof(* wq), GFP_KERNEL);
+ if (! wq) {
+ printk("Scif: CP work task alloc failed\n");
+ return -MR_ERR_NOMEM;
+ }
+
+ memset(wq, '\0', sizeof(*wq));
+ wq->req = cmd;
+ wq->priv = 1;
+ wq->fnc = (int (*)(void *)) fnc_map[cmd].fnc;
+ wq->ptr = buf;
+ err = micras_mt_tsk(wq);
+
+ kfree(wq);
+ }
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(micras_mt_call);
+
+
+
+/*
+**
+** The sysfs nodes provided by this module is not really associated
+** with a 'struct device', since we don't create device entries for
+** access through '/dev'. Instead we register a 'struct class'
+** with nodes defined with the CLASS_ATTR macro.
+** Reasons for this choice are:
+** - we don't want a device node created
+** - we don't need (at least now) to create udev events
+** - we don't act on suspend/resume transitions
+** - we don't want to have our files unnecessarily deep
+** in the sysfs file system.
+**
+** The sysfs layout is intended to look like:
+**
+** /sys/class/micras/ Root of this driver
+** /clst Core information
+** /cutl Core utilization
+** /ecc Error correction mode
+** /fan Fan controller
+** /freq Core frequency
+** /gddr GDDR devices
+** /gfreq GDDR speed
+** /gvolt GDDR voltage
+** /hwinf Hardware Info
+** /mem Memory utilization
+** /os OS status
+** /plim Power envelope
+** /power Card power
**	/temp		Board temperatures
+** /trbo Turbo mode
+** /trc Trace level
+** /vers uOS/Flash version
+** /volt Core voltage
+**
+** The following should be removed as there are better tools
+** available in /proc/<pid>/{stat|status|smap}, /proc/meminfo,
+** /proc/stat, /proc/uptime, /proc/loadavg, and /proc/cpuinfo:
+** clst, cutl, mem, os
+**
+** Below we hand-craft a 'micras' class to sit under '/sys/class'
+** with attribute nodes directly under it. Each attribute may
+** have a 'show' and a 'store' handler, both called with a reference
+** to its class (ras_class, may hold private data), it's class_attribute,
+** a buffer reference, and for 'store's a string length. The buffer
+** passed to 'show' is one page (PAGE_SIZE, 4096) which sets the
+** upper limit on the return string(s). Return value of 'store'
+** has to be either an error code (negative) or the count of bytes
+** consumed. If consumed less than what's passed in, the store routine
+** will be called again until all input data has been consumed.
+**
+** Function pointers are hardwired by the macros below since it
+** is easy and simpler than using the fnc_map table. This may
+** change if the command set expands uncontrolled.
** We have local helper functions to handle array prints.
+** Any locking required is handled in called routines, not here.
+**
+** Note: This is not coded for maximum performance, since the
+** use of the MT thread to serialize access to card data
+** has a cost of two task switches attached, both which
+** may cause delays due to other system activity.
+**
+*/
+
+
+/*
+ * Hack alert!
+ * Formatting routines for arrays of 16/32/64 bit unsigned ints.
+ * This reduces the printf argument list in _SHOW() macros below
+ * considerably, though perhaps at a cost in code efficiency.
+ * They need a scratch buffer in order to construct long lines.
+ * A quick swag at the largest possible response tells that we'll
+ * never exceed half if the page we are given to scribble into.
+ * So, instead of allocating print space, we'll simply use 2nd
+ * half of the page as scratch buffer.
+ */
+
+#define BP (buf + (PAGE_SIZE/2)) /* Scratch pad location */
+#define BL (PAGE_SIZE/2 - 1) /* Scratch size */
+
+
+/*
+ * Format an array of 16-bit values as a space separated list in 'buf'
+ * (at most 'siz' bytes, always NUL terminated).  Returns 'buf'.
+ *
+ * Fix: the elements are signed (int16_t, e.g. temperature readings),
+ * so they must be printed with %d; the previous %u rendered negative
+ * values as huge unsigned numbers after integer promotion.
+ */
+static char *
+arr16(int16_t * arr, int len, char * buf, int siz)
+{
+  int n, bs;
+
+  bs = 0;
+  for(n = 0; n < len && bs < siz; n++)
+    bs += scnprintf(buf + bs, siz - bs, "%s%d", n ? " " : "", arr[n]);
+  buf[bs] = '\0';
+
+  return buf;
+}
+
+
+/*
+ * Format an array of 32-bit unsigned values as a space separated
+ * list in 'buf' (at most 'siz' bytes, NUL terminated).  Returns 'buf'.
+ */
+static char *
+arr32(uint32_t * arr, int len, char * buf, int siz)
+{
+  int pos, i;
+
+  for(pos = 0, i = 0; i < len && pos < siz; i++)
+    pos += scnprintf(buf + pos, siz - pos, i ? " %u" : "%u", arr[i]);
+  buf[pos] = '\0';
+
+  return buf;
+}
+
+
+/*
+ * Format an array of 64-bit unsigned values as a space separated
+ * list in 'buf' (at most 'siz' bytes, NUL terminated).  Returns 'buf'.
+ */
+static char *
+arr64(uint64_t * arr, int len, char * buf, int siz)
+{
+  int pos, i;
+
+  for(pos = 0, i = 0; i < len && pos < siz; i++)
+    pos += scnprintf(buf + pos, siz - pos, i ? " %llu" : "%llu", arr[i]);
+  buf[pos] = '\0';
+
+  return buf;
+}
+
+
+/*
+ * _SHOW(op, rec, nam, fmt...) expands into micras_show_<nam>(), a sysfs
+ * 'show' handler.  It allocates a wq_task plus response record in one
+ * kmalloc, hands the request to the serializing MT thread through
+ * micras_mt_tsk(), and on success formats the response record 'r' into
+ * the one-page sysfs buffer with the supplied format/arguments.
+ * On MT failure an empty string (length 0) is returned to user space.
+ */
+#define _SHOW(op,rec,nam,str...) \
+  static ssize_t \
+  micras_show_##nam(struct class *class, \
+         struct class_attribute *attr, \
+	 char *buf) \
+  { \
+    struct mr_rsp_##rec  * r; \
+    struct wq_task	* wq; \
+    int		  len; \
+    int		  err; \
+\
+    wq = kmalloc(sizeof(* wq) + sizeof(* r), GFP_KERNEL); \
+    if (! wq) \
+      return -ENOMEM; \
+\
+    memset(wq, '\0', sizeof(* wq)); \
+    r = (struct mr_rsp_##rec *)(wq + 1); \
+    wq->req = MR_REQ_##op; \
+    wq->fnc = (int (*)(void *)) mr_get_##nam; \
+    wq->ptr = r; \
+    err = micras_mt_tsk(wq); \
+\
+    if (err < 0) { \
+      len = 0; \
+      *buf = '\0'; \
+    } \
+    else { \
+      len = scnprintf(buf, PAGE_SIZE, ##str); \
+    } \
+\
+    kfree(wq); \
+    return len; \
+  }
+
+/*
+ * Instantiate one sysfs 'show' handler per attribute.
+ * The format strings below define the exact text ABI of each
+ * attribute file; any change here is visible to user space.
+ */
+_SHOW(HWINF, hwinf, hwinf, "%u %u %u %u %u %u "
+	"%c%c%c%c%c%c%c%c%c%c%c%c "
+	"%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+	r->rev, r->step, r->substep, r->board, r->fab, r->sku,
+	r->serial[0], r->serial[1], r->serial[2], r->serial[3],
+	r->serial[4], r->serial[5], r->serial[6], r->serial[7],
+	r->serial[8], r->serial[9], r->serial[10], r->serial[11],
+	r->guid[0], r->guid[1], r->guid[2], r->guid[3],
+	r->guid[4], r->guid[5], r->guid[6], r->guid[7],
+	r->guid[8], r->guid[9], r->guid[10], r->guid[11],
+	r->guid[12], r->guid[13], r->guid[14], r->guid[15]);
+
+/*
+ * NOTE(review): the '+1' below presumably skips a leading tag/length
+ * byte in the version strings -- confirm against struct mr_rsp_vers.
+ */
+_SHOW(VERS, vers, vers, "\"%s\" \"%s\" \"%s\" \"%s\" \"%s\" \"%s\" \"%s\"\n",
+	r->fboot0 +1, r->fboot1 +1, r->flash[0] +1,
+	r->flash[1] +1, r->flash[2] +1, r->fsc +1, r->uos +1)
+
+_SHOW(CFREQ, freq, freq, "%u %u %s\n",
+	r->cur, r->def, arr32(r->supt, r->slen, BP, BL))
+
+_SHOW(CVOLT, volt, volt, "%u %u %s\n",
+	r->cur, r->set, arr32(r->supt, r->slen, BP, BL))
+
+#if defined(CONFIG_MK1OM) || (defined(CONFIG_ML1OM) && USE_FSC)
+/*
+ * Each arr32() below gets a private 32-byte slice of the scratch area;
+ * the '&r->xxx.pwr, 3' usage treats three consecutive struct fields as
+ * an array -- assumes they are adjacent uint32_t's (TODO confirm).
+ */
+_SHOW(PWR, power, power, "%d\n%d\n%d\n%d\n%d\n%d\n%d\n%s\n%s\n%s\n",
+	r->tot0.prr,
+	r->tot1.prr,
+	r->inst.prr,
+	r->imax.prr,
+	r->pcie.prr,
+	r->c2x3.prr,
+	r->c2x4.prr,
+	arr32(&r->vccp.pwr, 3, BP, 32),
+	arr32(&r->vddg.pwr, 3, BP + 32, 32),
+	arr32(&r->vddq.pwr, 3, BP + 64, 32))
+
+_SHOW(PLIM, plim, plim, "%u %u %u\n",
+	r->phys, r->hmrk, r->lmrk)
+#endif
+
+_SHOW(CLST, clst, clst, "%u %u\n",
+	r->count, r->thr)
+
+_SHOW(GDDR, gddr, gddr, "\"%s\" %u %u %u\n",
+	r->dev +1, r->rev, r->size, r->speed)
+
+_SHOW(GFREQ, gfreq, gfreq, "%u %u\n",
+	r->cur, r->def)
+
+_SHOW(GVOLT, gvolt, gvolt, "%u %u\n",
+	r->cur, r->set)
+
+/*
+ * NOTE(review): scratch offsets jump from BP+160 to BP+224, skipping
+ * BP+192.  Harmless (the area is scratch) but looks like an editing
+ * slip; each arr16() only needs its own 32-byte slice.
+ */
+_SHOW(TEMP, temp, temp, "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n",
+	arr16(&r->die.cur, 2, BP, 32),
+	arr16(&r->brd.cur, 2, BP + 32, 32),
+	arr16(&r->fin.cur, 2, BP + 64, 32),
+	arr16(&r->fout.cur, 2, BP + 96, 32),
+	arr16(&r->gddr.cur, 2, BP + 128, 32),
+	arr16(&r->vccp.cur, 2, BP + 160, 32),
+	arr16(&r->vddg.cur, 2, BP + 224, 32),
+	arr16(&r->vddq.cur, 2, BP + 256, 32))
+
+_SHOW(FAN, fan, fan, "%u %u %u\n",
+	r->override, r->pwm, r->rpm)
+
+#ifdef CONFIG_MK1OM
+_SHOW(ECC, ecc, ecc, "%d\n",
+	r->enable)
+#endif
+
+_SHOW(TRC, trc, trc, "%d\n",
+	r->lvl)
+
+_SHOW(TRBO, trbo, trbo, "%d %d %d\n",
+	r->set, r->state, r->avail)
+
+#ifdef CONFIG_MK1OM
+_SHOW(LED, led, led, "%d\n",
+	r->led)
+
+/* NOTE(review): trailing ';' after some _SHOW() uses below is redundant */
+_SHOW(PROCHOT, ptrig, prochot, "%d %d\n",
+	r->power, r->time);
+
+_SHOW(PWRALT, ptrig, pwralt, "%d %d\n",
+	r->power, r->time);
+
+_SHOW(PERST, perst, perst, "%d\n",
+	r->perst);
+
+_SHOW(TTL, ttl, ttl, "%u %u %u %u\n%u %u %u %u\n%u %u %u %u\n",
+	r->thermal.active, r->thermal.since, r->thermal.count, r->thermal.time,
+	r->power.active, r->power.since, r->power.count, r->power.time,
+	r->alert.active, r->alert.since, r->alert.count, r->alert.time);
+#endif
+
+_SHOW(CUTL, cutl, cutl, "%u %u %u %llu\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n...\n",
+	r->tck, r->core, r->thr, r->jif,
+	arr64(&r->sum.user, 4, BP, 80),
+	arr64(&r->cpu[0].user, 4, BP + 80, 80),
+	arr64(&r->cpu[1].user, 4, BP + 160, 80),
+	arr64(&r->cpu[2].user, 4, BP + 240, 80),
+	arr64(&r->cpu[3].user, 4, BP + 320, 80),
+	arr64(&r->cpu[4].user, 4, BP + 400, 80),
+	arr64(&r->cpu[5].user, 4, BP + 480, 80),
+	arr64(&r->cpu[6].user, 4, BP + 560, 80),
+	arr64(&r->cpu[7].user, 4, BP + 640, 80))
+
+_SHOW(MEM, mem, mem, "%u %u %u\n",
+	r->total, r->free, r->bufs)
+
+_SHOW(OS, os, os, "%llu %llu %llu %llu %u [%s]\n",
+	r->uptime, r->loads[0], r->loads[1], r->loads[2],
+	r->alen, arr32(r->apid, r->alen, BP, BL))
+
+
+/*
+ * Ensure caller's credentials are root on all 'set' files.
+ * Even though file creation mode should prevent writes?
+ *
+ *TBD:
+ * - How many of the 'store's are to be permitted?
+ */
+
+/*
+ * _STORE(op, nam) expands into micras_store_##nam(), a sysfs 'store'
+ * handler accepting a single unsigned integer (any strtoul base).
+ * Only euid 0 (root) may write.  A trailing newline is stripped; note
+ * this writes into the nominally const sysfs buffer through a cast --
+ * relies on the kernel handing us a private writable page (TODO confirm
+ * against the targeted kernel version).  On success the ORIGINAL byte
+ * count is returned so the whole write is seen as consumed; otherwise
+ * -EINVAL/-ENOMEM/-EPERM.  The new value is applied via mr_set_##nam
+ * on the serializing MT thread.
+ */
+#define _STORE(op, nam) \
+  static ssize_t \
+  micras_store_##nam (struct class *class, \
+         struct class_attribute *attr, \
+         const char *buf, \
+	 size_t count) \
+  { \
+    struct wq_task      * wq; \
+    size_t	  ocount; \
+    uint32_t	  val; \
+    int		  err; \
+    char	* ep; \
+\
+    if (current_euid() != 0) \
+      return -EPERM; \
+\
+    ocount = count; \
+    if (count && buf[count - 1] == '\n') \
+      ((char *) buf)[--count] = '\0'; \
+\
+    err = -EINVAL; \
+    if (count && *buf) { \
+      val = simple_strtoul(buf, &ep, 0); \
+      if (ep != buf && !*ep) { \
+        wq = kmalloc(sizeof(* wq), GFP_KERNEL); \
+        if (! wq) \
+          return -ENOMEM; \
+\
+        wq->req = MR_SET_##op; \
+        wq->fnc = (int (*)(void *)) mr_set_##nam; \
+        wq->ptr = (void *) &val; \
+        if (! micras_mt_tsk(wq)) \
+          err = ocount; \
+        kfree(wq); \
+      } \
+    } \
+\
+    return err; \
+  }
+
+/*
+ * Instantiate the root-only 'store' handlers; each takes one uint32.
+ */
+_STORE(CFREQ, freq)
+_STORE(CVOLT, volt)
+
+#if defined(CONFIG_MK1OM) || (defined(CONFIG_ML1OM) && USE_FSC)
+_STORE(PLIM, plim)
+#endif
+
+_STORE(FAN, fan)
+_STORE(TRC, trc)
+_STORE(TRBO, trbo)
+
+#ifdef CONFIG_MK1OM
+_STORE(LED, led)
+_STORE(PERST, perst)
+#endif
+
+
+/*
+ *TBD:
+ * - Remove entries clst, cutl, mem, and os.
+ * Only included here for comparison with what cp/micinfo displays.
+ * They really need to go.
+ */
+
+/*
+ * Attribute table for the 'micras' class: one sysfs file per entry,
+ * created directly under /sys/class/micras.  Mode 0644 entries pair a
+ * 'show' with a root-only 'store'; 0444 entries are read-only.
+ * Terminated by __ATTR_NULL.
+ */
+static struct class_attribute micras_attr[] = {
+  __ATTR(hwinf, 0444, micras_show_hwinf, 0),
+  __ATTR(vers, 0444, micras_show_vers, 0),
+  __ATTR(freq, 0644, micras_show_freq, micras_store_freq),
+  __ATTR(volt, 0644, micras_show_volt, micras_store_volt),
+#if defined(CONFIG_MK1OM) || (defined(CONFIG_ML1OM) && USE_FSC)
+  __ATTR(power, 0444, micras_show_power, 0),
+  __ATTR(plim, 0644, micras_show_plim, micras_store_plim),
+#endif
+  __ATTR(clst, 0444, micras_show_clst, 0),
+  __ATTR(gddr, 0444, micras_show_gddr, 0),
+  __ATTR(gfreq, 0444, micras_show_gfreq, 0),
+  __ATTR(gvolt, 0444, micras_show_gvolt, 0),
+  __ATTR(fan, 0644, micras_show_fan, micras_store_fan),
+  __ATTR(temp, 0444, micras_show_temp, 0),
+#ifdef CONFIG_MK1OM
+  __ATTR(ecc, 0444, micras_show_ecc, 0),
+#endif
+  __ATTR(trc, 0644, micras_show_trc, micras_store_trc),
+  __ATTR(trbo, 0644, micras_show_trbo, micras_store_trbo),
+#ifdef CONFIG_MK1OM
+  __ATTR(led, 0644, micras_show_led, micras_store_led),
+  __ATTR(prochot, 0444, micras_show_prochot, 0),
+  __ATTR(pwralt, 0444, micras_show_pwralt, 0),
+  __ATTR(perst, 0644, micras_show_perst, micras_store_perst),
+  __ATTR(ttl, 0444, micras_show_ttl, 0),
+#endif
+  __ATTR(cutl, 0444, micras_show_cutl, 0),
+  __ATTR(mem, 0444, micras_show_mem, 0),
+  __ATTR(os, 0444, micras_show_os, 0),
+  __ATTR_NULL,
+};
+
+
+/* The 'micras' class itself; registering it creates /sys/class/micras
+ * and instantiates every file in micras_attr[] above. */
+static struct class ras_class = {
+  .name	= "micras",
+  .owner	= THIS_MODULE,
+  .class_attrs	= micras_attr,
+};
+
+
+
+/*
+**
+** SCIF interface & services are mostly handled here, including
+** all aspects of setting up and tearing down SCIF channels.
+** We create three listening SCIF sockets and create a workqueue
+** with the initial task of waiting for 'accept's to happen.
+**
+** When TTL or MC accept incoming connections, their workqueue
+** task spawns one thread just to detect if/when peer closes
+** the session and will block any further connects until the
+** service thread terminates (peer closes session).
+** The TTL or MC event handler, executing in interrupt context,
+** will check for an open session and if one is present, deliver
+** their event record(s) on it by using scif_send().
+**
+** When CP accept incoming connections, its workqueue task spawns
+** a new thread to run a session with the peer and then proceeds
+** to accepting a new connection. Thus, there are no strict
+** bounds on number of incoming connections, but for internal
+** house-keeping sessions are limited to MR_SCIF_MAX (32).
+** Accepted requests from the peer are fulfilled through the
+** MT thread in a similar fashion as the sysctl interface, i.e.
+** through function micras_mt_tsk(), which guarantees synchronized
+** (serialized) access to MT core data and handle waits as needed.
+** Function pointers corresponding to request opcodes are found
+** by lookup in the fnc_map table.
+**
+** Note: This is not coded for maximum performance, since the
+** use of the MT thread to serialize access to card data
+** has a cost of two task switches attached, both which
+** may cause delays due to other system activity.
+*/
+
+
+/*
+ * Per-service SCIF bookkeeping.  CP supports up to MR_SCIF_MAX parallel
+ * sessions (bitmap of free slots + per-slot peer ID, thread and end
+ * point); MC and TTL each allow exactly one session.  The 'volatile'
+ * arrays are shared between the listener workqueue, the session
+ * threads, and the event-delivery paths.
+ */
+static scif_epd_t	 micras_cp_lstn;	/* CP listener handle */
+static struct workqueue_struct * micras_cp_wq;	/* CP listener thread */
+static atomic_t		 micras_cp_rst;	/* CP listener restart */
+static struct delayed_work	 micras_cp_tkn;	/* CP accept token */
+static DECLARE_BITMAP(micras_cp_fd, MR_SCIF_MAX);	/* CP free slots */
+static volatile struct scif_portID micras_cp_si[MR_SCIF_MAX];	/* CP sessions */
+static volatile struct task_struct * micras_cp_kt[MR_SCIF_MAX];	/* CP threads */
+static volatile scif_epd_t	 micras_cp_ep[MR_SCIF_MAX];	/* CP handles */
+
+static scif_epd_t	 micras_mc_lstn;	/* MC listener handle */
+static struct workqueue_struct * micras_mc_wq;	/* MC listener thread */
+static struct delayed_work	 micras_mc_tkn;	/* MC accept token */
+static volatile struct task_struct * micras_mc_kt;	/* MC session */
+static volatile scif_epd_t	 micras_mc_ep;	/* MC handle */
+
+static scif_epd_t	 micras_ttl_lstn;	/* TTL listener handle */
+static struct workqueue_struct * micras_ttl_wq;	/* TTL listener thread */
+static struct delayed_work	 micras_ttl_tkn;	/* TTL accept token */
+static volatile struct task_struct * micras_ttl_kt;	/* TTL session */
+static volatile scif_epd_t	 micras_ttl_ep;	/* TTL handle */
+
+
+/*
+ * SCIF CP session thread
+ */
+
+/*
+ * Per-connection CP service thread.
+ * Runs the MT request/response protocol on one accepted SCIF end point
+ * until the peer disconnects or an unrecoverable send/recv error
+ * occurs, then closes the end point and returns the session slot.
+ * Requests from peers on privileged ports (< 1024) may use privileged
+ * opcodes.
+ */
+static int
+micras_cp_sess(void * _slot)
+{
+  struct wq_task	* wq = NULL;
+  struct mr_hdr	  q, a;
+  scif_epd_t	  ep;
+  uint32_t	  slot;
+  void		* buf = NULL;
+  uint64_t	  start, stop;
+  int		  blen, len, priv;
+
+  slot = (uint32_t)((uint64_t) _slot);
+  priv = (micras_cp_si[slot].port < 1024) ? 1 : 0;
+#if MT_VERBOSE
+  printk("Scif: CP session %d running%s\n", slot, priv ? " privileged" : "");
+#endif
+
+  /*
+   * Allocate local buffer from kernel
+   * Since the I/O buffers in SCIF is just one page,
+   * we'd never expect to need larger buffers here.
+   * Fix: on failure go through cp_sess_end so the end point is
+   * closed and the session slot is returned to the free pool;
+   * a bare 'return 0' here leaked the slot and left the
+   * connection open forever.
+   */
+  buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+  if (! buf) {
+    printk("Scif: CP scratch pad alloc failed\n");
+    goto cp_sess_end;
+  }
+
+  /*
+   * Allocate a work queue task for the MT thread
+   */
+  wq = kmalloc(sizeof(* wq), GFP_KERNEL);
+  if (! wq) {
+    printk("Scif: CP work task alloc failed\n");
+    goto cp_sess_end;
+  }
+
+  /*
+   * Start servicing MT protocol
+   */
+  ep = micras_cp_ep[slot];
+  for( ;; ) {
+
+    /*
+     * Get a message header
+     */
+    len = scif_recv(ep, &q, sizeof(q), SCIF_RECV_BLOCK);
+    start = rdtsc();
+    if (len < 0) {
+      if (len != -ECONNRESET)
+        printk("Scif: CP recv error %d\n", len);
+      goto cp_sess_end;
+    }
+    if (len != sizeof(q)) {
+      printk("Scif: CP short recv (%d), discarding\n", len);
+      continue;
+    }
+
+    /*
+     * Validate the query:
+     *  - known good opcode,
+     *  - expected length (zero)
+     *  - have callout in jump table
+     *  - check requestor's port ID on privileged opcodes.
+     *
+     *TBD: opcodes above MR_REQ_MAX is really only meant for
+     *     use by the PM module. Should it be host accessible?
+     */
+    blen = 0;
+    if (q.cmd < MR_REQ_HWINF ||
+#if defined(CONFIG_MK1OM) && USE_PM
+		q.cmd > PM_REQ_MAX
+#else
+		q.cmd > MR_REQ_MAX
+#endif
+		) {
+      printk("Scif: CP opcode %d invalid\n", q.cmd);
+      blen = -MR_ERR_INVOP;
+    }
+    else
+    if (q.len != 0) {
+      printk("Scif: CP command length %d invalid\n", q.len);
+      blen = -MR_ERR_INVLEN;
+    }
+    else
+    if (! fnc_map[q.cmd].fnc) {
+      printk("Scif: CP opcode %d un-implemented\n", q.cmd);
+      blen = -MR_ERR_UNSUP;
+    }
+    else
+    if (fnc_map[q.cmd].privileged && !priv) {
+      printk("Scif: CP opcode %d privileged, remote %d:%d\n",
+		q.cmd, micras_cp_si[slot].node, micras_cp_si[slot].port);
+      blen = -MR_ERR_PERM;
+    }
+
+    /*
+     *TBD: If there is an error at this point, it might
+     *     be a good idea to drain the SCIF channel.
+     *     If garbage has entered the channel somehow,
+     *     then how else can we get in sync such that
+     *     next recv really is a command header?
+     *     More radical solution is closing this session.
+     */
+
+    /*
+     * If header is OK (blen still zero) then pass
+     * a work queue item to MT and wait for response.
+     * The result will end up in buf (payload for response)
+     * or an error code that can be sent back to requestor.
+     * Since we don't want to care about whether it is a
+     * get or set command here, the 'parm' value is copied
+     * into buf prior to passing the work item to MT.
+     * Thus, functions expecting an 'uint32_t *' to
+     * point to a new value will be satisfied.
+     */
+    if (blen == 0) {
+      if (fnc_map[q.cmd].simple) {
+        *((uint32_t *) buf) = q.parm;
+        blen = fnc_map[q.cmd].fnc(buf);
+      }
+      else {
+        memset(wq, '\0', sizeof(*wq));
+        wq->req = q.cmd;
+        wq->priv = priv;
+        wq->fnc = (int (*)(void *)) fnc_map[q.cmd].fnc;
+        wq->ptr = buf;
+        *((uint32_t *) buf) = q.parm;
+        blen = micras_mt_tsk(wq);
+      }
+    }
+    stop = rdtsc();
+
+    /*
+     * Craft response header
+     */
+    a.cmd = q.cmd | MR_RESP;
+    if (blen < 0) {
+      /*
+       * MT thread reported a failure.
+       * Set error bit and make error record in buf
+       */
+      a.cmd |= MR_ERROR;
+      ((struct mr_err *) buf)->err = -blen;
+      ((struct mr_err *) buf)->len = 0;
+      a.len = sizeof(struct mr_err);
+    }
+    else {
+      /*
+       * Payload size is set by call-out
+       */
+      a.len = blen;
+    }
+    a.stamp = q.stamp;
+    a.spent = stop - start;
+
+    /*
+     * Send response header (always)
+     */
+    len = scif_send(ep, &a, sizeof(a), SCIF_SEND_BLOCK);
+    if (len < 0) {
+      printk("Scif: header send error %d\n", len);
+      goto cp_sess_end;
+    }
+    if (len != sizeof(a)) {
+      printk("Scif: CP short header send (%d of %lu)\n", len, sizeof(a));
+      goto cp_sess_end;
+    }
+
+    /*
+     * Send payload (if any, defined by a.len)
+     */
+    if (a.len > 0) {
+      len = scif_send(ep, buf, a.len, SCIF_SEND_BLOCK);
+      if (len < 0) {
+        printk("Scif: CP payload send error %d\n", len);
+        goto cp_sess_end;
+      }
+      if (len != a.len) {
+        printk("Scif: CP short payload send (%d of %d)\n", len, a.len);
+        goto cp_sess_end;
+      }
+    }
+
+  }
+
+cp_sess_end:
+  /*
+   * Common exit: free scratch memory (kfree(NULL) is a no-op),
+   * close the end point exactly once (atomic swap guards against
+   * a concurrent teardown), and return the slot to the free pool.
+   */
+  kfree(wq);
+  kfree(buf);
+  ep = (scif_epd_t) atomic64_xchg((atomic64_t *)(micras_cp_ep + slot), 0);
+  if (ep)
+    scif_close(ep);
+  micras_cp_kt[slot] = 0;
+  set_bit(slot, micras_cp_fd);
+#if MT_VERBOSE
+  printk("Scif: CP session %d terminated, sess mask %lx\n", slot, micras_cp_fd[0]);
+#endif
+
+  /*
+   * If the listener stalled at the connection limit, restart it
+   * now that a slot is free again.
+   */
+  if (atomic_xchg(&micras_cp_rst, 0)) {
+    printk("Scif: resume listener\n");
+    queue_delayed_work(micras_cp_wq, &micras_cp_tkn, 0);
+  }
+
+  return 0;
+}
+
+
+/*
+ * SCIF CP session launcher
+ */
+
+/*
+ * CP listener workqueue item: block in scif_accept(), hand the new
+ * connection to a fresh session thread, then re-queue itself while
+ * free session slots remain.  When the slot pool is exhausted the
+ * restart flag is set and the terminating session resumes us.
+ */
+static void
+micras_cp(struct work_struct * work)
+{
+  struct task_struct    * thr;
+  scif_epd_t		  sess_ep;
+  struct scif_portID	  sess_id;
+  int			  slot;
+  int			  err;
+
+  /*
+   * Wait for somebody to connect to us
+   * We stop listening on any error whatsoever
+   */
+  err = scif_accept(micras_cp_lstn, &sess_id, &sess_ep, SCIF_ACCEPT_SYNC);
+  if (err == -EINTR) {
+    printk("Scif: CP accept interrupted, error %d\n", err);
+    return;
+  }
+  if (err < 0) {
+    printk("Scif: CP accept failed, error %d\n", err);
+    return;
+  }
+#if MT_VERBOSE
+  printk("Scif: CP accept: remote %d:%d, local %d:%d\n",
+  		sess_id.node, sess_id.port,
+		micras_cp_lstn->port.node, micras_cp_lstn->port.port);
+#endif
+
+  /*
+   * Spawn a new thread to run session with connecting peer
+   * We support only a limited number of connections, so first
+   * get a free "slot" for this session.
+   * The use of non-atomic ffs() below is safe as long as this
+   * function is never run by more than one thread at a time
+   * and all other manipulations of micras_cp_fd are atomic.
+   */
+  slot = find_first_bit(micras_cp_fd, MR_SCIF_MAX);
+  if (slot < MR_SCIF_MAX) {
+    if (micras_cp_kt[slot] || micras_cp_ep[slot]) {
+      printk("Scif: CP slot %d busy (bug)\n", slot);
+      /*
+       * NOTE(review): this path drops sess_ep without scif_close()
+       * and never re-queues the listener, killing the CP service.
+       * Believed unreachable ("bug"), but worth confirming.
+       */
+      return;
+    }
+
+    clear_bit(slot, micras_cp_fd);
+    micras_cp_ep[slot] = sess_ep;
+    micras_cp_si[slot] = sess_id;
+    thr = kthread_create(micras_cp_sess, (void *)(uint64_t) slot, "RAS CP svc %d", slot);
+    if (IS_ERR(thr)) {
+      printk("Scif: CP service thread creation failed\n");
+      scif_close(sess_ep);
+      micras_cp_ep[slot] = 0;
+      set_bit(slot, micras_cp_fd);
+      /*
+       * NOTE(review): the listener is not re-queued here either, so a
+       * transient kthread_create() failure permanently stops accepts.
+       */
+      return;
+    }
+    micras_cp_kt[slot] = thr;
+#if MT_VERBOSE
+    printk("Scif: CP session %d launched, pid %d\n", slot, thr->pid);
+#endif
+    wake_up_process(thr);
+  }
+  else {
+    printk("Scif: No open session slots, closing session\n");
+    scif_close(sess_ep);
+  }
+
+  /*
+   * Keep listening until session limit reached.
+   */
+  if (bitmap_weight(micras_cp_fd, MR_SCIF_MAX))
+    queue_delayed_work(micras_cp_wq, &micras_cp_tkn, 0);
+  else {
+    printk("Scif: CP connection limit reached\n");
+    atomic_xchg(&micras_cp_rst, 1);
+  }
+}
+
+
+/*
+ * SCIF MC session thread
+ */
+
+/*
+ * MC session thread: keeps the single MC connection alive.
+ * Each loop iteration syncs the kernel MC event log, then parks in a
+ * blocking 1-byte scif_recv() purely to detect the peer closing the
+ * connection; received bytes are ignored.  On disconnect the end
+ * point is closed (atomic swap so it happens once) and the session
+ * marked free.
+ */
+static int
+micras_mc_sess(void * dummy)
+{
+  scif_epd_t	ep;
+  char		buf[8];
+  int		len;
+
+#if MC_VERBOSE
+  printk("Scif: MC session running\n");
+#endif
+
+  /*
+   * Start servicing.
+   * This is just to get indication if peer closes connection
+   */
+  for( ;; ) {
+    /*
+     * Sync with kernel MC event log.
+     */
+    mcc_sync();
+
+    /*
+     * Try read 1 byte from host (turns into a wait-point
+     * keeping the connection open till host closes it)
+     */
+    len = scif_recv(micras_mc_ep, buf, 1, SCIF_RECV_BLOCK);
+    if (len < 0) {
+      if (len != -ECONNRESET)
+        printk("Scif: MC recv error %d\n", len);
+      goto mc_sess_end;
+    }
+
+    /*
+     * Ignore any received content.
+     */
+  }
+
+mc_sess_end:
+  ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_mc_ep, 0);
+  if (ep)
+    scif_close(ep);
+  micras_mc_kt = 0;
+#if MC_VERBOSE
+  printk("Scif: MC session terminated\n");
+#endif
+  return 0;
+}
+
+
+/*
+ * SCIF MC session launcher
+ */
+
+/*
+ * MC listener workqueue item: accept one connection, start the single
+ * MC session thread if no session is active (otherwise reject), and
+ * unconditionally re-queue itself to keep listening.
+ */
+static void
+micras_mc(struct work_struct * work)
+{
+  struct task_struct    * thr;
+  scif_epd_t		  sess_ep;
+  struct scif_portID	  sess_id;
+  int			  err;
+
+  /*
+   * Wait for somebody to connect to us
+   * We stop listening on any error whatsoever
+   */
+  err = scif_accept(micras_mc_lstn, &sess_id, &sess_ep, SCIF_ACCEPT_SYNC);
+  if (err == -EINTR) {
+    printk("Scif: MC accept interrupted, error %d\n", err);
+    return;
+  }
+  if (err < 0) {
+    printk("Scif: MC accept failed, error %d\n", err);
+    return;
+  }
+#if MC_VERBOSE
+  /* NOTE(review): reaches into scif_epd_t internals for the port pair */
+  printk("Scif: MC accept: remote %d:%d, local %d:%d\n",
+  		sess_ep->peer.node, sess_ep->peer.port,
+		sess_ep->port.node, sess_ep->port.port);
+#endif
+
+  /*
+   * Spawn a new thread to run session with connecting peer
+   * We support only one connection, so if one already is
+   * running this one will be rejected.
+   */
+  if (! micras_mc_ep) {
+    micras_mc_ep = sess_ep;
+    thr = kthread_create(micras_mc_sess, 0, "RAS MC svc");
+    if (IS_ERR(thr)) {
+      printk("Scif: MC service thread creation failed\n");
+      scif_close(sess_ep);
+      micras_mc_ep = 0;
+      return;
+    }
+    micras_mc_kt = thr;
+    wake_up_process(thr);
+  }
+  else {
+    printk("Scif: MC connection limit reached\n");
+    scif_close(sess_ep);
+  }
+
+  /*
+   * Keep listening
+   */
+  queue_delayed_work(micras_mc_wq, &micras_mc_tkn, 0);
+}
+
+
+/*
+ * Ship a pre-packaged machine check event record to host
+ */
+
+/* Fallback definition for kernels whose scif.h lacks SCIF_BLAST */
+#ifndef SCIF_BLAST
+#define SCIF_BLAST	2
+#endif
+
+/*
+ * Ship a machine check event record to the host over the MC session.
+ * 'exc' selects exception-context delivery (SCIF_BLAST: no sleeping,
+ * no spinlock waits) versus normal blocking send.  Returns 1 if the
+ * full record was sent, 0 otherwise (including "no session open").
+ */
+int
+micras_mc_send(struct mce_info * mce, int exc)
+{
+  if (micras_mc_ep) {
+    int err;
+
+#if ADD_DIE_TEMP
+    /* 'err' briefly reused to hold the SBOX thermal status read;
+     * die temp bits are packed into the record's flags field */
+    err = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2);
+    mce->flags |= PUT_BITS(15, 8, GET_BITS(19, 10, err));
+#endif
+
+    if (exc) {
+      /*
+       * Exception context SCIF access, can't sleep and can't
+       * wait on spinlocks either. May be detrimental to
+       * other scif communications, but this _is_ an emergency
+       * and we _do_ need to ship this message to the host.
+       */
+      err = scif_send(micras_mc_ep, mce, sizeof(*mce), SCIF_BLAST);
+      if (err != sizeof(*mce))
+        ee_printk("micras_mc_send: scif_send failed, err %d\n", err);
+    }
+    else {
+      /*
+       * Thread context SCIF access.
+       * Just send message.
+       */
+      err = scif_send(micras_mc_ep, mce, sizeof(*mce), SCIF_SEND_BLOCK);
+      if (err != sizeof(*mce))
+        printk("micras_mc_send: scif_send failed, err %d\n", err);
+    }
+    return err == sizeof(*mce);
+  }
+  return 0;
+}
+
+
+/*
+ * SCIF TTL session thread
+ */
+
+/*
+ * TTL session thread: mirrors micras_mc_sess() minus the log sync.
+ * Parks in a blocking 1-byte scif_recv() solely to detect the peer
+ * closing the connection; received data is ignored.  On disconnect
+ * the end point is closed once (atomic swap) and the session freed.
+ */
+static int
+micras_ttl_sess(void * dummy)
+{
+  scif_epd_t	ep;
+  char		buf[8];
+  int		len;
+
+#if PM_VERBOSE
+  printk("Scif: TTL session running\n");
+#endif
+
+  /*
+   * Start servicing.
+   * This is just to get indication if peer closes connection
+   */
+  for( ;; ) {
+    /*
+     * Try read 1 byte from host (turns into a wait-point
+     * keeping the connection open till host closes it)
+     */
+    len = scif_recv(micras_ttl_ep, buf, 1, SCIF_RECV_BLOCK);
+    if (len < 0) {
+      if (len != -ECONNRESET)
+        printk("Scif: TTL recv error %d\n", len);
+      goto ttl_sess_end;
+    }
+
+    /*
+     * Ignore any received content.
+     */
+  }
+
+ttl_sess_end:
+  ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_ttl_ep, 0);
+  if (ep)
+    scif_close(ep);
+  micras_ttl_kt = 0;
+#if PM_VERBOSE
+  printk("Scif: TTL session terminated\n");
+#endif
+  return 0;
+}
+
+
+/*
+ * SCIF TTL session launcher
+ */
+
+/*
+ * TTL listener workqueue item: accept one connection, start the single
+ * TTL session thread if none is active (otherwise reject), and
+ * unconditionally re-queue itself to keep listening.
+ */
+static void
+micras_ttl(struct work_struct * work)
+{
+  struct task_struct    * thr;
+  scif_epd_t		  sess_ep;
+  struct scif_portID	  sess_id;
+  int			  err;
+
+  /*
+   * Wait for somebody to connect to us
+   * We stop listening on any error whatsoever
+   */
+  err = scif_accept(micras_ttl_lstn, &sess_id, &sess_ep, SCIF_ACCEPT_SYNC);
+  if (err == -EINTR) {
+    printk("Scif: TTL accept interrupted, error %d\n", err);
+    return;
+  }
+  if (err < 0) {
+    printk("Scif: TTL accept failed, error %d\n", err);
+    return;
+  }
+#if PM_VERBOSE
+  /* NOTE(review): reaches into scif_epd_t internals for the port pair */
+  printk("Scif: TTL accept: remote %d:%d, local %d:%d\n",
+  		sess_ep->peer.node, sess_ep->peer.port,
+		sess_ep->port.node, sess_ep->port.port);
+#endif
+
+  /*
+   * Spawn a new thread to run session with connecting peer
+   * We support only one connection, so if one already is
+   * running this one will be rejected.
+   */
+  if (! micras_ttl_ep) {
+    micras_ttl_ep = sess_ep;
+    thr = kthread_create(micras_ttl_sess, 0, "RAS TTL svc");
+    if (IS_ERR(thr)) {
+      printk("Scif: TTL service thread creation failed\n");
+      scif_close(sess_ep);
+      micras_ttl_ep = 0;
+      return;
+    }
+    micras_ttl_kt = thr;
+    wake_up_process(thr);
+  }
+  else {
+    printk("Scif: TTL connection limit reached\n");
+    scif_close(sess_ep);
+  }
+
+  /*
+   * Keep listening
+   */
+  queue_delayed_work(micras_ttl_wq, &micras_ttl_tkn, 0);
+}
+
+
+/*
+ * Ship a pre-packaged throttle event record to host
+ */
+
+/*
+ * Ship a throttle event record to the host over the TTL session.
+ * Sends are non-blocking (flag 0); on a partial send the remainder is
+ * parked in static split_rec/split_rem and retried on the next call
+ * before any new record goes out.  A failed SCIF send silently drops
+ * the record.
+ * NOTE(review): the static resume state is unguarded -- assumes all
+ * callers run in a single serialized context; confirm at call sites.
+ */
+void
+micras_ttl_send(struct ttl_info * ttl)
+{
+  static struct ttl_info split_rec;	/* Un-sent tail of a previous record */
+  static int	 split_rem;		/* Bytes of split_rec still pending */
+  int err;
+  char * cp;
+
+  if (micras_ttl_ep) {
+
+    if (split_rem) {
+      cp = ((char *) &split_rec) + (sizeof(*ttl) - split_rem);
+      err = scif_send(micras_ttl_ep, cp, split_rem, 0);
+      if (err == split_rem) {
+        /*
+         * Tx of pending buffer complete
+         */
+        split_rem = 0;
+      }
+      else {
+        if (err < 0) {
+	  /*
+	   * SCIF failed squarely, just drop the message.
+	   * TBD: close end point?
+	   */
+        }
+        else {
+          /*
+           * Another partial send
+           */
+          split_rem -= err;
+        }
+      }
+    }
+
+    if (! split_rem) {
+      /*
+       * Send message
+       */
+      err = scif_send(micras_ttl_ep, ttl, sizeof(*ttl), 0);
+      if (err != sizeof(*ttl)) {
+        /*
+	 * Did not send all the message
+	 */
+        if (err < 0) {
+	  /*
+	   * SCIF failed squarely, drop the message.
+	   * TBD: close end point?
+	   */
+	}
+	else {
+	  split_rec = *ttl;
+	  split_rem = sizeof(*ttl) - err;
+	}
+      }
+    }
+  }
+}
+
+
+
+/*
+**
+** MMIO regions used by RAS module
+** Until some common strategy on access to BOXes and other CSRs
+** we'll map them ourselves. All MMIO accesses are performed
+** through 32 bit unsigned integers, but a 64 bit abstraction
+** is provided for convenience (low 32 bit done first).
+**
+** We need access to the SBOX, all GBOXs, TBOXs and DBOXs.
+**
+** Note: I2C driver code for exception context in micras_elog.c
+** has its own set of I/O routines in order to allow
+** separate debugging.
+**
+*/
+
+/* Mapped MMIO base pointers, one per box instance; the mapping code
+ * is elsewhere in this module (not in view here). */
+uint8_t *	micras_sbox;		/* SBOX mmio region */
+uint8_t *	micras_dbox[DBOX_NUM];	/* DBOX mmio region */
+uint8_t *	micras_gbox[GBOX_NUM];	/* GBOX mmio regions */
+#ifdef CONFIG_MK1OM
+uint8_t *	micras_tbox[TBOX_NUM];	/* TBOX mmio regions */
+#endif
+
+/*
+ * Specials: some defines are currently missing
+ */
+
+#ifdef CONFIG_MK1OM
+#define DBOX1_BASE	0x0800620000ULL
+
+/* Note: GBOX4..7 physical bases DESCEND as the unit number rises */
+#define GBOX4_BASE	0x08006D0000ULL
+#define GBOX5_BASE	0x08006C0000ULL
+#define GBOX6_BASE	0x08006B0000ULL
+#define GBOX7_BASE	0x08006A0000ULL
+#endif
+
+
+/*
+ * MMIO I/O dumpers (for debug)
+ * Exception mode code needs to use the ee_print dumpers
+ * because printk is not safe to use (works most of the time
+ * though, but may hang the system eventually).
+ */
+/* Flip the outer '#if 0' to 1 to enable MMIO access tracing; the inner
+ * '#if 0' selects the exception-safe ee_print variant over printk.
+ * The RL/RQ/WL/WQ macros expand inside the accessors below and refer
+ * to their local 'roff' and 'val' variables by name. */
+#if 0
+#if 0
+extern atomic_t  pxa_block;
+#define RL	if (! atomic_read(&pxa_block)) ee_print("%s: %4x -> %08x\n", __FUNCTION__, roff, val)
+#define RQ	if (! atomic_read(&pxa_block)) ee_print("%s: %4x -> %016llx\n", __FUNCTION__, roff, val)
+#define WL	if (! atomic_read(&pxa_block)) ee_print("%s: %4x <- %08x\n", __FUNCTION__, roff, val)
+#define WQ	if (! atomic_read(&pxa_block)) ee_print("%s: %4x <- %016llx\n", __FUNCTION__, roff, val)
+#else
+#define RL	printk("%s: %4x -> %08x\n", __FUNCTION__, roff, val)
+#define RQ	printk("%s: %4x -> %016llx\n", __FUNCTION__, roff, val)
+#define WL	printk("%s: %4x <- %08x\n", __FUNCTION__, roff, val)
+#define WQ	printk("%s: %4x <- %016llx\n", __FUNCTION__, roff, val)
+#endif
+#else
+#define RL	/* As nothing */
+#define RQ	/* As nothing */
+#define WL	/* As nothing */
+#define WQ	/* As nothing */
+#endif
+
+
+/*
+ * SBOX MMIO I/O routines
+ * mr_sbox_base Return SBOX MMIO region
+ * mr_sbox_rl Read 32-bit register
+ * mr_sbox_rq Read 64-bit register (really two 32-bit reads)
+ * mr_sbox_wl Write 32-bit register
+ * mr_sbox_wq Write 64-bit register (really two 32-bit writes)
+ */
+
+#if NOT_YET
+/* Return the mapped SBOX MMIO base (single instance; 'dummy' unused) */
+uint8_t *
+mr_sbox_base(int dummy)
+{
+  return micras_sbox;
+}
+#endif
+
+/* Read a 32-bit SBOX register at byte offset 'roff' */
+uint32_t
+mr_sbox_rl(int dummy, uint32_t roff)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_sbox + roff);
+  uint32_t val = *reg;
+
+  RL;
+  return val;
+}
+
+/* Read a 64-bit SBOX register as two 32-bit reads, low half first */
+uint64_t
+mr_sbox_rq(int dummy, uint32_t roff)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_sbox + roff);
+  uint64_t val;
+
+  val  = (uint64_t) reg[0];
+  val |= (uint64_t) reg[1] << 32;
+  RQ;
+  return val;
+}
+
+/* Write a 32-bit SBOX register at byte offset 'roff' */
+void
+mr_sbox_wl(int dummy, uint32_t roff, uint32_t val)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_sbox + roff);
+
+  WL;
+  *reg = val;
+}
+
+/* Write a 64-bit SBOX register as two 32-bit writes, low half first */
+void
+mr_sbox_wq(int dummy, uint32_t roff, uint64_t val)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_sbox + roff);
+
+  WQ;
+  reg[0] = (uint32_t) val;
+  reg[1] = (uint32_t)(val >> 32);
+}
+
+
+/*
+ * DBOX MMIO I/O routines
+ * mr_dbox_base Return DBOX MMIO region
+ * mr_dbox_rl Read 32-bit register
+ * mr_dbox_rq Read 64-bit register (really two 32-bit reads)
+ * mr_dbox_wl Write 32-bit register
+ * mr_dbox_wq Write 64-bit register (really two 32-bit writes)
+ */
+
+#if NOT_YET
+/* Return the mapped MMIO base of DBOX instance 'unit' */
+uint8_t *
+mr_dbox_base(int unit)
+{
+  return micras_dbox[unit];
+}
+#endif
+
+/* Read a 32-bit register of DBOX 'unit' at byte offset 'roff' */
+uint32_t
+mr_dbox_rl(int unit, uint32_t roff)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_dbox[unit] + roff);
+  uint32_t val = *reg;
+
+  RL;
+  return val;
+}
+
+/* Read a 64-bit DBOX register as two 32-bit reads, low half first */
+uint64_t
+mr_dbox_rq(int unit, uint32_t roff)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_dbox[unit] + roff);
+  uint64_t val;
+
+  val  = (uint64_t) reg[0];
+  val |= (uint64_t) reg[1] << 32;
+  RQ;
+  return val;
+}
+
+/* Write a 32-bit register of DBOX 'unit' at byte offset 'roff' */
+void
+mr_dbox_wl(int unit, uint32_t roff, uint32_t val)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_dbox[unit] + roff);
+
+  WL;
+  *reg = val;
+}
+
+/* Write a 64-bit DBOX register as two 32-bit writes, low half first */
+void
+mr_dbox_wq(int unit, uint32_t roff, uint64_t val)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_dbox[unit] + roff);
+
+  WQ;
+  reg[0] = (uint32_t) val;
+  reg[1] = (uint32_t)(val >> 32);
+}
+
+
+/*
+ * GBOX MMIO I/O routines
+ * mr_gbox_base Return GBOX MMIO region
+ * mr_gbox_rl Read 32-bit register
+ * mr_gbox_rq Read 64-bit register (really two 32-bit reads)
+ * mr_gbox_wl Write 32-bit register
+ * mr_gbox_wq Write 64-bit register (really two 32-bit writes)
+ *
+ * Due to a Si bug, MMIO writes can be dropped by the GBOXs
+ * during heavy DMA activity (HSD #4844222). The risk of it
+ * happening is low enough that a 'repeat until it sticks'
+ * workaround is sufficient. No 'read' issues so far.
+ *
+ *TBD: Ramesh asked that GBOX MMIOs check for sleep states.
+ * Not sure how to do that, but here is a good spot to
+ * add such check, as all GBOX access comes thru here.
+ */
+
+#if NOT_YET
+/* Return the mapped MMIO base of GBOX instance 'unit' */
+uint8_t *
+mr_gbox_base(int unit)
+{
+  return micras_gbox[unit];
+}
+#endif
+
+/* Read a 32-bit register of GBOX 'unit' at byte offset 'roff' */
+uint32_t
+mr_gbox_rl(int unit, uint32_t roff)
+{
+  volatile uint32_t * reg = (volatile uint32_t *)(micras_gbox[unit] + roff);
+  uint32_t val = *reg;
+
+  RL;
+  return val;
+}
+
+/*
+ * Read a 64-bit GBOX register, low half first.  The MCA_STATUS
+ * register at 0x5c is special-cased: its high half lives at 0xac.
+ */
+uint64_t
+mr_gbox_rq(int unit, uint32_t roff)
+{
+  uint32_t	hi, lo;
+  uint64_t	val;
+
+  lo = * (volatile uint32_t *)(micras_gbox[unit] + roff);
+  if (roff == 0x5c) {
+    /*
+     * Instead of placing HI part of MCA_STATUS
+     * at 0x60 to form a natural 64-bit register,
+     * it is located at 0xac, against all conventions.
+     */
+    hi = * (volatile uint32_t *)(micras_gbox[unit] + 0xac);
+  }
+  else
+    hi = * (volatile uint32_t *)(micras_gbox[unit] + roff + 4);
+  val = ((uint64_t) hi << 32) | (uint64_t) lo;
+  RQ;
+  return val;
+}
+
+/*
+ * Write a 32-bit GBOX register.  Under the HSD 4844222 workaround the
+ * write is read back and retried up to 10 times until it sticks; if
+ * all retries fail the write is silently abandoned.
+ */
+void
+mr_gbox_wl(int unit, uint32_t roff, uint32_t val)
+{
+#if !GBOX_WORKING
+  {
+    int		rpt;
+    uint32_t	rb;
+
+    /*
+     * Due to bug HSD 4844222 loop until value sticks
+     */
+    for(rpt = 10; rpt-- ; ) {
+#endif
+
+  WL;
+  * (volatile uint32_t *)(micras_gbox[unit] + roff) = val;
+
+#if !GBOX_WORKING
+      rb = mr_gbox_rl(unit, roff);
+      if (rb == val)
+        break;
+    }
+  }
+#endif
+}
+
+/*
+ * Write a 64-bit GBOX register as two 32-bit writes, low half first.
+ * MCA_STATUS at 0x5c keeps its high half at 0xac (see mr_gbox_rq).
+ * Under the HSD 4844222 workaround the value is read back and the
+ * write retried up to 10 times until it sticks.
+ */
+void
+mr_gbox_wq(int unit, uint32_t roff, uint64_t val)
+{
+  uint32_t	hi, lo;
+
+  lo = val;
+  hi = val >> 32;
+
+#if !GBOX_WORKING
+  {
+    int		rpt;
+    uint64_t	rb;
+
+    /*
+     * Due to bug HSD 4844222 loop until value sticks
+     * Note: this may result in bad things happening if
+     *       writing to a MMIO MCA STATUS register
+     *       since there is a non-zero chance that the
+     *       NMI handler can fire and change the register
+     *       inside this loop. Require that the caller
+     *       is on same CPU as the NMI handler (#0).
+     */
+    for(rpt = 10; rpt-- ; ) {
+#endif
+
+  WQ;
+  * (volatile uint32_t *)(micras_gbox[unit] + roff) = lo;
+  if (roff == 0x5c) {
+    /*
+     * Instead of placing HI part of MCA_STATUS
+     * at 0x60 to form a natural 64-bit register,
+     * it is located at 0xac, against all conventions.
+     */
+    * (volatile uint32_t *)(micras_gbox[unit] + 0xac) = hi;
+  }
+  else
+    * (volatile uint32_t *)(micras_gbox[unit] + roff + 4) = hi;
+
+#if !GBOX_WORKING
+      rb = mr_gbox_rq(unit, roff);
+      if (rb == val)
+        break;
+    }
+  }
+#endif
+}
+
+
+#ifdef CONFIG_MK1OM
+/*
+ * TBOX MMIO I/O routines
+ * mr_tbox_base Return TBOX MMIO region
+ * mr_tbox_rl Read 32-bit register
+ * mr_tbox_rq Read 64-bit register (really two 32-bit reads)
+ * mr_tbox_wl Write 32-bit register
+ * mr_tbox_wq Write 64-bit register (really two 32-bit writes)
+ *
+ * Some SKUs don't have TBOXs, in which case the
+ * micras_tbox array will contain null pointers.
+ * We do not test for this here, but expect that
+ * caller either know what he's doing or consult
+ * the mr_tbox_base() function first.
+ */
+
+#if NOT_YET
/*
 * Return the kernel virtual base address of TBOX 'unit's MMIO
 * region. May be a null pointer on SKUs without TBOXes (see
 * block comment above); callers must check or call mr_txs() first.
 */
uint8_t *
mr_tbox_base(int unit)
{
 return micras_tbox[unit];
}
+#endif
+
/*
 * Read a 32-bit TBOX register.
 * RL is a read hook/barrier macro defined elsewhere in this module.
 * NOTE(review): micras_tbox[unit] is not checked for null here by
 * design; caller is expected to verify TBOX presence (see above).
 */
uint32_t
mr_tbox_rl(int unit, uint32_t roff)
{
 uint32_t val;

 val = * (volatile uint32_t *)(micras_tbox[unit] + roff);
 RL;
 return val;
}
+
/*
 * Read a 64-bit TBOX register as two 32-bit MMIO reads
 * (low half at roff, high half at roff + 4). The two halves
 * are not read atomically; the register may change in between.
 */
uint64_t
mr_tbox_rq(int unit, uint32_t roff)
{
 uint32_t hi, lo;
 uint64_t val;

 lo = * (volatile uint32_t *)(micras_tbox[unit] + roff);
 hi = * (volatile uint32_t *)(micras_tbox[unit] + roff + 4);
 val = ((uint64_t) hi << 32) | (uint64_t) lo;
 RQ;
 return val;
}
+
/*
 * Write a 32-bit TBOX register.
 * WL is a write hook/barrier macro defined elsewhere in this module.
 */
void
mr_tbox_wl(int unit, uint32_t roff, uint32_t val)
{
 WL;
 * (volatile uint32_t *)(micras_tbox[unit] + roff) = val;
}
+
/*
 * Write a 64-bit TBOX register as two 32-bit MMIO writes
 * (low half at roff, then high half at roff + 4).
 * The two halves are not written atomically.
 */
void
mr_tbox_wq(int unit, uint32_t roff, uint64_t val)
{
 uint32_t hi, lo;

 WQ;
 lo = val;
 hi = val >> 32;

 * (volatile uint32_t *)(micras_tbox[unit] + roff) = lo;
 * (volatile uint32_t *)(micras_tbox[unit] + roff + 4) = hi;
}
+#endif
+
+
+
+/*
+**
+** SMP utilities for CP and MC.
+** The kernel offers routines for MSRs, but as far
+** as I could find then there isn't any for some
+** CPU registers we need, like CR4.
+**
+** rd_cr4_on_cpu Read a CR4 value on CPU
+** set_in_cr4_on_cpu Set bits in CR4 on a CPU
+** clear_in_cr4_on_cpu Guess...
+** rdtsc Read time stamp counter
+**
+**TBD: Special case when CPU happens to be current?
+*/
+
+#if NOT_YET
/* IPI callback: store this CPU's CR4 into the uint32_t pointed to by p.
 * NOTE(review): read_cr4() returns unsigned long; the store truncates
 * to 32 bits — confirm upper CR4 bits are never needed here. */
static void
_rd_cr4_on_cpu(void * p)
{
 *((uint32_t *) p) = read_cr4();
}

/*
 * Read CR4 on the given CPU via a synchronous cross-CPU call
 * (last argument 1 = wait for completion, so 'cr4' is valid on return).
 */
uint32_t
rd_cr4_on_cpu(int cpu)
{
 uint32_t cr4;

 smp_call_function_single(cpu, _rd_cr4_on_cpu, &cr4, 1);
 return cr4;
}
+
/* IPI callback: OR the mask pointed to by p into this CPU's CR4. */
static void
_set_in_cr4_on_cpu(void * p)
{
 uint32_t cr4;

 cr4 = read_cr4();
 cr4 |= * (uint32_t *) p;
 write_cr4(cr4);
}

/*
 * Set bit mask 'm' in CR4 on the given CPU (synchronous: waits
 * for the cross-CPU call to finish before returning).
 */
void
set_in_cr4_on_cpu(int cpu, uint32_t m)
{
 smp_call_function_single(cpu, _set_in_cr4_on_cpu, &m, 1);
}
+
/* IPI callback: clear the mask pointed to by p from this CPU's CR4. */
static void
_clear_in_cr4_on_cpu(void * p)
{
 uint32_t cr4;

 cr4 = read_cr4();
 cr4 &= ~ *(uint32_t *) p;
 write_cr4(cr4);
}

/*
 * Clear bit mask 'm' in CR4 on the given CPU (synchronous: waits
 * for the cross-CPU call to finish before returning).
 */
void
clear_in_cr4_on_cpu(int cpu, uint32_t m)
{
 smp_call_function_single(cpu, _clear_in_cr4_on_cpu, &m, 1);
}
+#endif
+
/*
 * Read the CPU's 64-bit time stamp counter via the RDTSC
 * instruction (low half in EAX, high half in EDX).
 */
uint64_t
rdtsc(void)
{
 uint32_t eax, edx;

 __asm__ __volatile__ ("rdtsc" : "=a" (eax), "=d" (edx));
 return (((uint64_t) edx) << 32) | (uint64_t) eax;
}
+
+
+
+/*
+**
+** Module load/unload logic
+**
+*/
+
+
+/*
+ * Startup job (run by MT thread)
+ * Intended to handle tasks that cannot impact
+ * module load status, such as kicking off service
+ * work queues, etc.
+ */
+
/*
 * Deferred (second-stage) module initialization, run on the MT
 * work queue ~500 ms after micras_init() completes.
 *
 * Unless the module is already being torn down (micras_stop set),
 * this arms the 1-second tick work, the CP/MC/TTL SCIF listener
 * works, and optionally the /proc "die" entry, then drops the
 * module reference taken by micras_init() before queuing us.
 */
static void
micras_init2(struct work_struct * work)
{
 /*
 * Make MT one-time setup and kick
 * off 1 sec timer and SCIF listeners
 */
 if (! micras_stop) {

 /* Periodic MT tick, first run after 5 seconds */
 INIT_DELAYED_WORK(&micras_wq_tick, micras_mt_tick);
 queue_delayed_work(micras_wq, &micras_wq_tick, msecs_to_jiffies(5000));

 /* All CP session slots start out free */
 bitmap_fill(micras_cp_fd, MR_SCIF_MAX);
 INIT_DELAYED_WORK(&micras_cp_tkn, micras_cp);
 queue_delayed_work(micras_cp_wq, &micras_cp_tkn, 0);

 INIT_DELAYED_WORK(&micras_mc_tkn, micras_mc);
 queue_delayed_work(micras_mc_wq, &micras_mc_tkn, 0);

 INIT_DELAYED_WORK(&micras_ttl_tkn, micras_ttl);
 queue_delayed_work(micras_ttl_wq, &micras_ttl_tkn, 0);

#if defined(CONFIG_MK1OM) && WA_4845465 && DIE_PROC
 if (smc_4845465)
 die_pe = proc_create("die", 0644, 0, &proc_die_operations);
#endif

 printk("RAS.init: module operational\n");
 /* Balance the try_module_get() done in micras_init() */
 module_put(THIS_MODULE);
 }
}
+
+
+static int __init
+micras_init(void)
+{
+ int i;
+ int err;
+
+ printk("Loading RAS module ver %s. Build date: %s\n", RAS_VER, __DATE__);
+
+ /*
+ * Create work queue for the monitoring thread
+ * and pass it some initial work to start with.
+ */
+#if defined(CONFIG_MK1OM) && WA_4845465
+ micras_wq = alloc_workqueue("RAS MT", WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+#else
+ micras_wq = create_singlethread_workqueue("RAS MT");
+#endif
+ if (! micras_wq) {
+ err = -ESRCH;
+ printk("RAS.init: cannot start work queue, error %d\n", err);
+ goto fail_wq;
+ }
+
+ /*
+ * Register top sysfs class (directory) and attach attributes (files)
+ * beneath it. No 'device's involved.
+ */
+ err = class_register(&ras_class);
+ if (err) {
+ printk("RAS.init: cannot register class 'micras', error %d\n", err);
+ goto fail_class;
+ }
+
+ /*
+ * Setup CP SCIF port in listening mode
+ */
+ micras_cp_lstn = scif_open();
+ if (! micras_cp_lstn) {
+ printk("RAS.init: cannot get SCIF CP endpoint\n");
+ goto fail_cp;
+ }
+ err = scif_bind(micras_cp_lstn, MR_MON_PORT);
+ if (err < 0) {
+ printk("RAS.init: cannot bind SCIF CP endpoint, error %d\n", err);
+ goto fail_cp_ep;
+ }
+ err = scif_listen(micras_cp_lstn, MR_SCIF_MAX);
+ if (err < 0) {
+ printk("RAS.init: cannot make SCIF CP listen, error %d\n", err);
+ goto fail_cp_ep;
+ }
+ micras_cp_wq = create_singlethread_workqueue("RAS CP listen");
+ if (! micras_cp_wq) {
+ err = -ESRCH;
+ printk("RAS.init: cannot start CP listener work queue, error %d\n", err);
+ goto fail_cp_ep;
+ }
+
+ /*
+ * Setup MC SCIF port in listening mode
+ */
+ micras_mc_lstn = scif_open();
+ if (! micras_mc_lstn) {
+ printk("RAS.init: cannot get SCIF MC endpoint\n");
+ goto fail_mc;
+ }
+ err = scif_bind(micras_mc_lstn, MR_MCE_PORT);
+ if (err < 0) {
+ printk("RAS.init: cannot bind SCIF MC endpoint, error %d\n", err);
+ goto fail_mc_ep;
+ }
+ err = scif_listen(micras_mc_lstn, MR_SCIF_MAX);
+ if (err < 0) {
+ printk("RAS.init: cannot make SCIF MC listen, error %d\n", err);
+ goto fail_mc_ep;
+ }
+ micras_mc_wq = create_singlethread_workqueue("RAS MC listen");
+ if (! micras_mc_wq) {
+ err = -ESRCH;
+ printk("RAS.init: cannot start listener work queue, error %d\n", err);
+ goto fail_mc_ep;
+ }
+
+ /*
+ * Setup TTL SCIF port in listening mode
+ */
+ micras_ttl_lstn = scif_open();
+ if (! micras_ttl_lstn) {
+ printk("RAS.init: cannot get SCIF TTL endpoint\n");
+ goto fail_ttl;
+ }
+ err = scif_bind(micras_ttl_lstn, MR_TTL_PORT);
+ if (err < 0) {
+ printk("RAS.init: cannot bind SCIF TTL endpoint, error %d\n", err);
+ goto fail_ttl_ep;
+ }
+ err = scif_listen(micras_ttl_lstn, MR_SCIF_MAX);
+ if (err < 0) {
+ printk("RAS.init: cannot make SCIF TTL listen, error %d\n", err);
+ goto fail_ttl_ep;
+ }
+ micras_ttl_wq = create_singlethread_workqueue("RAS TTL listen");
+ if (! micras_ttl_wq) {
+ err = -ESRCH;
+ printk("RAS.init: cannot start listener work queue, error %d\n", err);
+ goto fail_ttl_ep;
+ }
+
+ /*
+ * Make the MMIO maps we need.
+ */
+ micras_sbox = ioremap(SBOX_BASE, COMMON_MMIO_BOX_SIZE);
+ if (! micras_sbox)
+ goto fail_iomap;
+
+ micras_dbox[0] = ioremap(DBOX0_BASE, COMMON_MMIO_BOX_SIZE);
+ if (! micras_dbox[0])
+ goto fail_iomap;
+
+#ifdef CONFIG_MK1OM
+ micras_dbox[1] = ioremap(DBOX1_BASE, COMMON_MMIO_BOX_SIZE);
+ if (! micras_dbox[1])
+ goto fail_iomap;
+#endif
+
+ micras_gbox[0] = ioremap(GBOX0_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_gbox[1] = ioremap(GBOX1_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_gbox[2] = ioremap(GBOX2_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_gbox[3] = ioremap(GBOX3_BASE, COMMON_MMIO_BOX_SIZE);
+ if (!micras_gbox[0] || !micras_gbox[1] ||
+ !micras_gbox[2] || !micras_gbox[3])
+ goto fail_iomap;
+
+#ifdef CONFIG_MK1OM
+ micras_gbox[4] = ioremap(GBOX4_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_gbox[5] = ioremap(GBOX5_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_gbox[6] = ioremap(GBOX6_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_gbox[7] = ioremap(GBOX7_BASE, COMMON_MMIO_BOX_SIZE);
+ if (!micras_gbox[4] || !micras_gbox[5] ||
+ !micras_gbox[6] || !micras_gbox[7])
+ goto fail_iomap;
+#endif
+
+#ifdef CONFIG_MK1OM
+ /*
+ * Most SKUs don't have TBOXes.
+ * If not, then don't map to their MMIO space
+ */
+ if (mr_txs()) {
+ micras_tbox[0] = ioremap(TXS0_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_tbox[1] = ioremap(TXS1_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_tbox[2] = ioremap(TXS2_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_tbox[3] = ioremap(TXS3_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_tbox[4] = ioremap(TXS4_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_tbox[5] = ioremap(TXS5_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_tbox[6] = ioremap(TXS6_BASE, COMMON_MMIO_BOX_SIZE);
+ micras_tbox[7] = ioremap(TXS7_BASE, COMMON_MMIO_BOX_SIZE);
+ if (!micras_tbox[0] || !micras_tbox[1] ||
+ !micras_tbox[2] || !micras_tbox[3] ||
+ !micras_tbox[4] || !micras_tbox[5] ||
+ !micras_tbox[6] || !micras_tbox[7])
+ goto fail_iomap;
+ }
+#endif
+
+ /*
+ * Setup non-volatile MC error logging device.
+ */
+ if (ee_init())
+ goto fail_iomap;
+
+ /*
+ * Setup core MC event handler.
+ * If this can't fail, move into micras_wq_init instead.
+ */
+ if (mcc_init())
+ goto fail_ee;
+
+ /*
+ * Setup un-core MC event handler.
+ * If this can't fail, move into micras_wq_init instead.
+ */
+ if (mcu_init())
+ goto fail_core;
+
+ /*
+ * Prepare MT drivers
+ */
+ mr_mt_init();
+
+#if defined(CONFIG_MK1OM) && USE_PM
+ /*
+ * Setup PM interface
+ */
+ if (pm_init())
+ goto fail_uncore;
+#endif
+
+#if defined(CONFIG_MK1OM) && WA_4845465
+ /*
+ * Launch SMC temperature push work.
+ * Supported by SMC firmware later than 121.11 (build 4511).
+ */
+ {
+ extern int mr_smc_rd(uint8_t, uint32_t *);
+ int rev, ref;
+
+ mr_smc_rd(0x11, &rev);
+ if (rev) {
+ ref = PUT_BITS(31, 24, 121) |
+ PUT_BITS(23, 16, 11) |
+ PUT_BITS(15, 0, 4511);
+
+ if (rev >= ref)
+ smc_4845465 = rev;
+ }
+
+ if (smc_4845465) {
+ INIT_DELAYED_WORK(&micras_wq_smc, micras_mt_smc);
+ queue_delayed_work(micras_wq, &micras_wq_smc, 0);
+ printk("RAS.init: HSD 4845465 workaround active, fw %x\n", rev);
+ }
+ else
+ printk("RAS.init: SMC too old for HSD 4845465 workaround, fw %x\n", rev);
+ }
+#endif
+
+ /*
+ * Launch deferable setup work
+ */
+ try_module_get(THIS_MODULE);
+ INIT_DELAYED_WORK(&micras_wq_init, micras_init2);
+ queue_delayed_work(micras_wq, &micras_wq_init, msecs_to_jiffies(500));
+ printk("RAS module load completed\n");
+ return err;
+
+ /*
+ * Error exits: unwind all setup done so far and return failure
+ *
+ *TBD: consider calling exit function. Requires that it can tell
+ * with certainty what has been setup and what hasn't.
+ */
+#if defined(CONFIG_MK1OM) && USE_PM
+fail_uncore:
+ mr_mt_exit();
+ mcu_exit();
+#endif
+fail_core:
+ mcc_exit();
+fail_ee:
+#ifdef CONFIG_MK1OM
+ ee_exit();
+#endif
+fail_iomap:
+ if (micras_sbox)
+ iounmap(micras_sbox);
+ for(i = 0; i < ARRAY_SIZE(micras_dbox); i++)
+ if (micras_dbox[i])
+ iounmap(micras_dbox[i]);
+ for(i = 0; i < ARRAY_SIZE(micras_gbox); i++)
+ if (micras_gbox[i])
+ iounmap(micras_gbox[i]);
+#ifdef CONFIG_MK1OM
+ for(i = 0; i < ARRAY_SIZE(micras_tbox); i++)
+ if (micras_tbox[i])
+ iounmap(micras_tbox[i]);
+#endif
+
+ destroy_workqueue(micras_ttl_wq);
+
+fail_ttl_ep:
+ scif_close(micras_ttl_lstn);
+
+fail_ttl:
+ destroy_workqueue(micras_mc_wq);
+
+fail_mc_ep:
+ scif_close(micras_mc_lstn);
+
+fail_mc:
+ destroy_workqueue(micras_cp_wq);
+
+fail_cp_ep:
+ scif_close(micras_cp_lstn);
+
+fail_cp:
+ class_unregister(&ras_class);
+
+fail_class:
+ micras_stop = 1;
+ flush_workqueue(micras_wq);
+ destroy_workqueue(micras_wq);
+
+fail_wq:
+ printk("RAS module load failed\n");
+ return err;
+}
+
+
/*
 * Module unload entry point.
 *
 * Reverses micras_init()/micras_init2() in careful order:
 * flag shutdown, detach MC handlers, close SCIF listeners and
 * their work queues, force-close any live sessions and wait
 * (bounded) for their threads, then tear down sysfs, PM, the
 * MT work queue, MT state, MMIO maps and the /proc entry.
 */
static void __exit
micras_exit(void)
{
 int i;
 scif_epd_t ep;

 printk("Unloading RAS module\n");
 micras_stop = 1;

 /*
 * Disconnect MC event handlers and
 * close the I2C eeprom interfaces.
 */
 mcu_exit();
 mcc_exit();
 ee_exit();

 /*
 * Close SCIF listeners (no more connects).
 */
 scif_close(micras_cp_lstn);
 scif_close(micras_mc_lstn);
 scif_close(micras_ttl_lstn);
 msleep(10);
 destroy_workqueue(micras_cp_wq);
 destroy_workqueue(micras_mc_wq);
 destroy_workqueue(micras_ttl_wq);

 /*
 * Terminate active sessions by closing their end points.
 * Session threads then should clean up after themselves.
 * The atomic64_xchg ensures only one of us and the session
 * thread closes a given endpoint.
 */
 for(i = 0; i < MR_SCIF_MAX; i++) {
 if (micras_cp_kt[i]) {
 printk("RAS.exit: force closing CP session %d\n", i);
 ep = (scif_epd_t) atomic64_xchg((atomic64_t *)(micras_cp_ep + i), 0);
 if (ep)
 scif_close(ep);
 }
 }
 /* Wait up to ~1 second for all CP session slots to be freed */
 for(i = 0; i < 1000; i++) {
 if (bitmap_weight(micras_cp_fd, MR_SCIF_MAX) == MR_SCIF_MAX)
 break;
 msleep(1);
 }
 if (micras_mc_kt) {
 printk("RAS.exit: force closing MC session\n");
 ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_mc_ep, 0);
 if (ep)
 scif_close(ep);
 for(i = 0; (i < 1000) && micras_mc_kt; i++)
 msleep(1);
 }
 if (micras_ttl_kt) {
 printk("RAS.exit: force closing TTL session\n");
 ep = (scif_epd_t) atomic64_xchg((atomic64_t *) &micras_ttl_ep, 0);
 if (ep)
 scif_close(ep);
 for(i = 0; (i < 1000) && micras_ttl_kt; i++)
 msleep(1);
 }

 /*
 * Tear down sysfs class and its nodes
 */
 class_unregister(&ras_class);

#if defined(CONFIG_MK1OM) && USE_PM
 /*
 * De-register with the PM module.
 */
 pm_exit();
#endif

 /*
 * Shut down the work queues
 */
#if defined(CONFIG_MK1OM) && WA_4845465
 if (smc_4845465)
 cancel_delayed_work(&micras_wq_smc);
#endif
 cancel_delayed_work(&micras_wq_tick);
 cancel_delayed_work(&micras_wq_init);
 flush_workqueue(micras_wq);
 destroy_workqueue(micras_wq);

 /*
 * Restore MT state
 */
 mr_mt_exit();

 /*
 * Remove MMIO region maps
 */
 iounmap(micras_sbox);
 for(i = 0; i < ARRAY_SIZE(micras_dbox); i++)
 if (micras_dbox[i])
 iounmap(micras_dbox[i]);
 for(i = 0; i < ARRAY_SIZE(micras_gbox); i++)
 if (micras_gbox[i])
 iounmap(micras_gbox[i]);
#ifdef CONFIG_MK1OM
 for(i = 0; i < ARRAY_SIZE(micras_tbox); i++)
 if (micras_tbox[i])
 iounmap(micras_tbox[i]);
#endif

#if defined(CONFIG_MK1OM) && WA_4845465 && DIE_PROC
 if (smc_4845465 && die_pe) {
 remove_proc_entry("die", 0);
 die_pe = 0;
 }
#endif

 printk("RAS module unload completed\n");
}
+
/* Kernel module entry/exit registration and metadata */
module_init(micras_init);
module_exit(micras_exit);

MODULE_AUTHOR("Intel Corp. 2013 (" __DATE__ ") ver " RAS_VER);
MODULE_DESCRIPTION("RAS and HW monitoring module for MIC");
MODULE_LICENSE("GPL");
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS PM interface
+ *
+ * Contains code to handle interaction with the PM driver.
+ * This includes the initial upload of core voltages and
+ * frequencies, handling of 'turbo' mode, and accounting
+ * for and reporting of card throttles.
+ * This really is for KnC only.
+ */
+
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/io.h>
+#include <linux/cred.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/apic.h>
+#include <asm/mic/mic_common.h>
+#include <asm/mic/mic_knc/micsboxdefine.h>
+#include <scif.h>
+#include "micras.h"
+#include "monahan.h"
+#include <asm/mic/micpm_device.h>
+
+#if USE_PM
+
+static atomic_t pm_entry; /* Active calls from PM */
+
+
+/*
+ * Local variables to keep track of throttle states
+ *
+ * onoff Set to 1 if throttling is in effect, otherwise 0
+ * count Count of complete throttles (not counting current).
+ * time Time spent in complete throttles
+ * start Time when current throttle started (or 0)
+ *
+ * Units of time is measured in jiffies and converted to mSecs
+ * at the end of a throttle period. Jiffies are lower resolution
+ * than mSec. If a throttle starts and ends within same jiffy,
+ * a standard penalty of 1/2 jiffy gets added.
+ *
+ *TBD: perhaps it's better simply to add 1/2 jiffy to every throttle
+ * period to compensate for rounding down errors. Would be fair
+ * if average throttle period is more than 1 jiffy long.
+ *
+ *TBD: Using atomics may be overkill. Calls from the RAS MT thread
+ * will be serialized (guaranteed), i.e. the report routine needs
+ * not to care about re-entrancy.
+ */
+
+static atomic_t tmp_onoff;
+static atomic_t tmp_count;
+static atomic_long_t tmp_time;
+static atomic_long_t tmp_start;
+
+static atomic_t pwr_onoff;
+static atomic_t pwr_count;
+static atomic_long_t pwr_time;
+static atomic_long_t pwr_start;
+
+static atomic_t alrt_onoff;
+static atomic_t alrt_count;
+static atomic_long_t alrt_time;
+static atomic_long_t alrt_start;
+
+
+static void
+mr_pwr_enter(void)
+{
+ if (atomic_xchg(&pwr_onoff, 1))
+ return;
+
+ atomic_long_set(&pwr_start, jiffies);
+}
+
+static void
+mr_pwr_leave(void) {
+ unsigned long then;
+
+ if (! atomic_xchg(&pwr_onoff, 0))
+ return;
+
+ then = atomic_long_xchg(&pwr_start, 0);
+ atomic_inc(&pwr_count);
+
+ if (jiffies == then)
+ atomic_long_add(jiffies_to_msecs(1) / 2, &pwr_time);
+ else
+ atomic_long_add(jiffies_to_msecs(jiffies - then), &pwr_time);
+}
+
+
+static void
+mr_tmp_enter(void)
+{
+ if (atomic_xchg(&tmp_onoff, 1))
+ return;
+
+ atomic_long_set(&tmp_start, jiffies);
+}
+
+static void
+mr_tmp_leave(void)
+{
+ unsigned long then;
+
+ if (! atomic_xchg(&tmp_onoff, 0))
+ return;
+
+ then = atomic_long_xchg(&tmp_start, 0);
+ atomic_inc(&tmp_count);
+ if (jiffies == then)
+ atomic_long_add(jiffies_to_msecs(1) / 2, &tmp_time);
+ else
+ atomic_long_add(jiffies_to_msecs(jiffies - then), &tmp_time);
+}
+
+
+static void
+mr_alrt_enter(void)
+{
+ if (atomic_xchg(&alrt_onoff, 1))
+ return;
+
+ atomic_long_set(&alrt_start, jiffies);
+}
+
+static void
+mr_alrt_leave(void)
+{
+ unsigned long then;
+
+ if (! atomic_xchg(&alrt_onoff, 0))
+ return;
+
+ then = atomic_long_xchg(&alrt_start, 0);
+ atomic_inc(&alrt_count);
+ if (jiffies == then)
+ atomic_long_add(jiffies_to_msecs(1) / 2, &alrt_time);
+ else
+ atomic_long_add(jiffies_to_msecs(jiffies - then), &alrt_time);
+}
+
+
+
+/*
+ * Report current throttle state(s) to MT.
+ * Simple copy of local variables, except for the time
+ * measurement, where current throttle (if any) is included.
+ * Don't want a lock to gate access to the local variables,
+ * so the atomics needs to be read in the correct order.
+ * First throttle state, then adder if throttle is in
+ * progress, then counters. If PM enters or leave throttle
+ * while reading stats, the worst is that time for the
 * current throttle is not included until next read.
+ */
+
/*
 * Fill 'rsp' with the current power/thermal/alert throttle stats.
 * Lock-free: see the ordering discussion in the comment above.
 * For an active throttle, 'since' reports how long the current
 * period has lasted; 'time' covers completed periods only.
 * Always returns 0.
 */
int
mr_pm_ttl(struct mr_rsp_ttl * rsp)
{
 unsigned long then;

 rsp->power.since = 0;
 rsp->power.active = (uint8_t) atomic_read(&pwr_onoff);
 if (rsp->power.active) {
 then = atomic_long_read(&pwr_start);
 if (then)
 rsp->power.since = jiffies_to_msecs(jiffies - then);
 }
 rsp->power.count = atomic_read(&pwr_count);
 rsp->power.time = atomic_long_read(&pwr_time);

 rsp->thermal.since = 0;
 rsp->thermal.active = (uint8_t) atomic_read(&tmp_onoff);
 if (rsp->thermal.active) {
 then = atomic_long_read(&tmp_start);
 if (then)
 rsp->thermal.since = jiffies_to_msecs(jiffies - then);
 }
 rsp->thermal.count = atomic_read(&tmp_count);
 rsp->thermal.time = atomic_long_read(&tmp_time);

 rsp->alert.since = 0;
 rsp->alert.active = (uint8_t) atomic_read(&alrt_onoff);
 if (rsp->alert.active) {
 then = atomic_long_read(&alrt_start);
 if (then)
 rsp->alert.since = jiffies_to_msecs(jiffies - then);
 }
 rsp->alert.count = atomic_read(&alrt_count);
 rsp->alert.time = atomic_long_read(&alrt_time);

 return 0;
}
+
+
+/*
+ * Throttle signaling function (call from PM)
+ */
+
+static int ttl_tcrit;
+
/*
 * Throttle event entry point, called by the PM driver.
 *
 * which TTL_POWER or TTL_THERMAL event class
 * state TTL_OFF or TTL_ON
 *
 * Classifies the event into one of the three local throttle
 * trackers (power alert, power throttle, thermal throttle) —
 * using current die temperature vs ttl_tcrit to disambiguate
 * TTL_THERMAL events — and forwards the resulting state change
 * to listeners via micras_ttl_send().
 */
void
mr_throttle(int which, int state)
{
 struct ttl_info ttl;
 uint32_t tmp;

 atomic_inc(&pm_entry);

 /* Current die temperature, bits 19:10 of THERMAL_STATUS_2 */
 tmp = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2);
 ttl.die = GET_BITS(19, 10, tmp);

 /*
 * PM is weird in the distinction of thermal and power throttle.
 * Power below PLIM should be quiet. Power between PLim1 and PLim0
 * results in TTL_POWER events. Power above PLim0 results in both
 * TTL_POWER and TTL_THERMAL events, _even_ if temperature is well
 * below Tcrit. We handle this by maintaining 3 throttle related
 * event types: thermal throttles, power throttles and power alert.
 * The power alert is flagged on entry as TTL_POWER, no problems.
 * The two throttles both come in as TTL_THERMAL, so we use current
 * die temperature to determine whether it was a thermal threshold
 * or the power limit that was exceeded. Point is power throttles
 * arriving while temperature is above Tcrit _will_ be counted as
 * thermal throttles, period.
 */
 ttl.upd = 0;
 switch(which) {
 case TTL_POWER:
 (state == TTL_OFF) ? mr_alrt_leave() : mr_alrt_enter();
 ttl.upd |= PM_ALRT_TTL_CHG;
 ttl.upd |= atomic_read(&alrt_onoff) ? PM_ALRT_TTL : 0;
 break;

 case TTL_THERMAL:
#if 1
 /*
 * Careful here: may get throttle ON while die > tcrit
 * and select thermal throttle correctly and then get
 * the corresponding throttle OFF when die has fallen
 * below tcrit in which case we must de-assert thermal
 * throttle.
 * As a shortcut, we deassert both throttles if the
 * GPU_HOT signal gets de-asserted (which is correct).
 */
 if (state == TTL_OFF) {
 if (atomic_read(&pwr_onoff))
 ttl.upd |= PM_PWR_TTL_CHG;
 if (atomic_read(&tmp_onoff))
 ttl.upd |= PM_TRM_TTL_CHG;
 mr_pwr_leave();
 mr_tmp_leave();
 }
 else {
 if (ttl_tcrit && ttl.die < ttl_tcrit) {
 if (! atomic_read(&pwr_onoff))
 ttl.upd |= (PM_PWR_TTL_CHG | PM_PWR_TTL);
 mr_pwr_enter();
 }
 else {
 if (! atomic_read(&tmp_onoff))
 ttl.upd |= (PM_TRM_TTL_CHG | PM_TRM_TTL);
 mr_tmp_enter();
 }
 }
#else
 /* Alternate (disabled) classification: pick one tracker by die temp */
 if (ttl_tcrit && ttl.die < ttl_tcrit) {
 (state == TTL_OFF) ? mr_pwr_leave() : mr_pwr_enter();
 ttl.upd |= PM_PWR_TTL_CHG;
 ttl.upd |= atomic_read(&pwr_onoff) ? PM_PWR_TTL : 0;
 }
 else {
 (state == TTL_OFF) ? mr_tmp_leave() : mr_tmp_enter();
 ttl.upd |= PM_TRM_TTL_CHG;
 ttl.upd |= atomic_read(&tmp_onoff) ? PM_TRM_TTL : 0;
 }
#endif
 break;
 }

 /* Tell TTL listeners what changed */
 micras_ttl_send(&ttl);

#if 0
 printk("ttl - args: which %d, state %d\n", which, state);

 printk("ttl - therm: on %d, count %d, time %ld, start %ld\n",
 atomic_read(&tmp_onoff), atomic_read(&tmp_count),
 atomic_long_read(&tmp_time), atomic_long_read(&tmp_start));

 printk("ttl - power: on %d, count %d, time %ld, start %ld\n",
 atomic_read(&pwr_onoff), atomic_read(&pwr_count),
 atomic_long_read(&pwr_time), atomic_long_read(&pwr_start));

 printk("ttl - alert: on %d, count %d, time %ld, start %ld\n",
 atomic_read(&alrt_onoff), atomic_read(&alrt_count),
 atomic_long_read(&alrt_time), atomic_long_read(&alrt_start));
#endif

 atomic_dec(&pm_entry);
}
+
+
+/*
+ * Throttle signaling function (call from notifier chain)
+ *
+ * TBD: should we test for odd state transitions and recursions?
+ */
+
+static int
+mr_pm_throttle_callback(struct notifier_block *nb, unsigned long event, void *msg)
+{
+ atomic_inc(&pm_entry);
+
+ switch(event) {
+
+ case EVENT_PROCHOT_ON:
+ mr_throttle(TTL_THERMAL, TTL_ON);
+ break;
+
+ case EVENT_PROCHOT_OFF:
+ mr_throttle(TTL_THERMAL, TTL_OFF);
+ break;
+
+ case EVENT_PWR_ALERT_ON:
+ mr_throttle(TTL_POWER, TTL_ON);
+ break;
+
+ case EVENT_PWR_ALERT_OFF:
+ mr_throttle(TTL_POWER, TTL_OFF);
+ break;
+
+ default:
+ /*
+ * Ignore whatever else is sent this way
+ */
+ break;
+ }
+
+ atomic_dec(&pm_entry);
+ return 0;
+}
+
+
+
+
+/*
+**
+** Power management routines
+**
+** one_mmio_rd Read one MMIO register into memory safe
+** one_mmio_wr Write one MMIO register from memory safe
+**
+** one_msr_rd Read one MSR register into memory safe
+** one_msr_wr Write one MSR register from memory safe
+**
+** mc_suspend Prepare for suspend, preserve CSRs to safe
+** mc_suspend_cancel Suspend canceled, restore operating mode
+** mc_resume Recover from suspend, restore CSRs from safe
+**
+** For now this stores all registers that are used by this module.
+** In reality, only those registers on power planes turned off in
+** deep sleep states needs to be stored, but at this point it is
+** not known which registers are in that group. This is a table
+** driven mechanism that _only_ handles RAS related registers.
+**
+**TBD: Turn off MC handlers while in suspend?
+** Both pro's and con's on this one, such as
+** + Disabling uncore is easy, just clear INT_EN
+** + prevents MC to interfere with PM state transitions
+** - can hide corruption due to UC errors
+** - requires a lot of IPIs to shut down core MC handling
+** + there's nobody to handle MCs when cores are asleep.
+** ? can events hide in *BOX banks during suspend/resume
+** and fire when restoring the INT_EN register?
+** - Disabling core is not that easy (from a module).
+** Enabling core MCEs requires setting flag X86_CR4_MCE
+** in CR4 on every core _and_ writing ~0 to MSR IA32_MCG_CAP
+** on every CPU. Probably better to let per-CPU routines
+** like mce_suspend() and mce_resume() handle it, with
+** some care because we'd want to save all CTLs before
+** mce_suspend() runs and restore them after mce_resume().
+** Problem is how to get at these functions; they are not
+** exported and seems not to be hooked into the kernel's PM
+** call chains. Perhaps sysclass abstraction ties into PM.
+** Even so, who's to invoke it and how?
+*/
+
+#define SAVE_BLOCK_MCA 1 /* Disable MC handling in suspend */
+#define RAS_SAVE_MSR 1 /* Include global MSRs in suspend */
+#define RAS_SAVE_CPU_MSR 0 /* Include per-CPU MSRs in suspend */
+
+#define SBOX 1 /* SBOX register (index 0) */
+#define DBOX 2 /* DBOX register (index 0..1) */
+#define GBOX 3 /* GBOX register (index 0..7) */
+#define TBOX 4 /* TBOX register (index 0..7) */
+#define GMSR 5 /* Global MSR (index 0) */
+#define LMSR 6 /* Per-CPU MSR (index 0..CONFIG_NR_CPUS-1) */
+
+#define W64 (1 << 6) /* 64 bit MMIO register (32 bit default) */
+#define VLD (1 << 7) /* Register value valid, can be restored */
+
/*
 * Descriptor for one register to be preserved across suspend and
 * restored on resume (MMIO box CSR or MSR; see tables below).
 */
typedef struct _regrec {
 uint8_t box; /* Box type + width bit + valid bit */
 uint8_t num; /* Box index (or 0) */
 uint16_t ofs; /* MMIO byte offset / MSR number */
 uint64_t reg; /* Register value */
} RegRec;
+
+
+/*
+ * Rumor has it that SBOX CSRs below 0x7000 will survive deep sleep
+ * Think it's safer to save/restore CSRs that RAS writes to anyways.
+ * We'll leave out a bunch of RO CSRs, most of which are HW status.
+ * SCRATCH<n> CSRs are above 0x7000 and needs to be preserved.
+ *
+ *TBD: Somebody else to preserve scratch CSRs not used by RAS?
+ * For now I'll save and restore all of them.
+ */
+
/* MMIO registers saved by mc_suspend() / restored by mc_resume().
 * SBOX_MCA_INT_EN must stay first (see SAVE_BLOCK_MCA handling). */
static RegRec susp_mmio[] = { /* Used in file */
 { SBOX, 0, SBOX_MCA_INT_EN, 0 }, /* Uncore, must be 1st */
 { SBOX, 0, SBOX_SCRATCH0, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH1, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH2, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH3, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH4, 0 }, /* Common, knc, */
 { SBOX, 0, SBOX_SCRATCH5, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH6, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH7, 0 }, /* Knc, knf */
 { SBOX, 0, SBOX_SCRATCH8, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH9, 0 }, /* Common, knc, knf */
 { SBOX, 0, SBOX_SCRATCH10, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH11, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH12, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH13, 0 }, /* Common */
 { SBOX, 0, SBOX_SCRATCH14, 0 }, /* - */
 { SBOX, 0, SBOX_SCRATCH15, 0 }, /* - */
// { SBOX, 0, SBOX_COMPONENT_ID, 0 }, /* Knc */
// { SBOX, 0, SBOX_SVIDCONTROL, 0 }, /* Knc */
// { SBOX, 0, SBOX_PCIE_PCI_SUBSYSTEM, 0 }, /* Common */
// { SBOX, 0, SBOX_PCIE_VENDOR_ID_DEVICE_ID, 0 }, /* Common */
// { SBOX, 0, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8, 0 },/* Common */
 { SBOX, 0, SBOX_OC_I2C_ICR + ICR_OFFSET, 0 }, /* Elog */
 { SBOX, 0, SBOX_OC_I2C_ICR + ISR_OFFSET, 0 }, /* Elog */
 { SBOX, 0, SBOX_OC_I2C_ICR + ISAR_OFFSET, 0 }, /* Elog */
 { SBOX, 0, SBOX_OC_I2C_ICR + IDBR_OFFSET, 0 }, /* Elog */
// { SBOX, 0, SBOX_OC_I2C_ICR + IBMR_OFFSET, 0 }, /* Elog */
// { SBOX, 0, SBOX_COREVOLT, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_COREFREQ, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MEMVOLT, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MEMORYFREQ, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENTRATIO, 0 }, /* Knc */
// { SBOX, 0, SBOX_BOARD_VOLTAGE_SENSE, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_THERMAL_STATUS, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_BOARD_TEMP1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_BOARD_TEMP2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP0, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP0, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_STATUS_FAN1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_STATUS_FAN2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_SPEED_OVERRIDE_FAN, 0 }, /* Knc, knf */
 { SBOX, 0, SBOX_MCA_INT_STAT, 0 }, /* Uncore */
// { SBOX, 0, SBOX_APICRT16, 0 }, /* Uncore */
 { SBOX, 0, SBOX_MCX_CTL_LO, 0 }, /* Uncore */
 { DBOX, 0, DBOX_MC2_CTL, 0 }, /* Uncore */
#ifdef CONFIG_MK1OM
 { DBOX, 1, DBOX_MC2_CTL, 0 }, /* Uncore */
#endif
 { GBOX | W64, 0, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
 { GBOX | W64, 1, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
 { GBOX | W64, 2, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
 { GBOX | W64, 3, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
#ifdef CONFIG_MK1OM
 { GBOX | W64, 4, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
 { GBOX | W64, 5, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
 { GBOX | W64, 6, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
 { GBOX | W64, 7, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
#endif
#ifdef CONFIG_MK1OM
 { TBOX | W64, 0, TXS_MCX_CONTROL, 0 }, /* Uncore */
 { TBOX | W64, 1, TXS_MCX_CONTROL, 0 }, /* Uncore */
 { TBOX | W64, 2, TXS_MCX_CONTROL, 0 }, /* Uncore */
 { TBOX | W64, 3, TXS_MCX_CONTROL, 0 }, /* Uncore */
 { TBOX | W64, 4, TXS_MCX_CONTROL, 0 }, /* Uncore */
 { TBOX | W64, 5, TXS_MCX_CONTROL, 0 }, /* Uncore */
 { TBOX | W64, 6, TXS_MCX_CONTROL, 0 }, /* Uncore */
 { TBOX | W64, 7, TXS_MCX_CONTROL, 0 }, /* Uncore */
#endif
};
+
+#if RAS_SAVE_MSR
/* Global (any-CPU) MSRs saved across suspend */
static RegRec susp_msr[] = { /* Used in file */
 { GMSR, 0, MSR_IA32_MCG_STATUS, 0 }, /* Uncore, kernel */
};
+
+#if RAS_SAVE_CPU_MSR
/* Per-CPU MSRs saved across suspend; 4 entries per CPU, with
 * entries beyond CPU 0 replicated at runtime by pm_init(). */
static RegRec susp_lcl_msr[4 * CONFIG_NR_CPUS] = { /* Used in file */
 { LMSR, 0, MSR_IA32_MCx_CTL(0), 0 }, /* Core, kernel */
 { LMSR, 0, MSR_IA32_MCx_CTL(1), 0 }, /* Core, kernel */
 { LMSR, 0, MSR_IA32_MCx_CTL(2), 0 }, /* Core, kernel */
 { LMSR, 0, MSR_IA32_MCG_CTL, 0 }, /* kernel */
 /*
 * The remaining entries is setup/replicated by pm_init()
 */
};
+#endif
+#endif
+
+
/*
 * Read one MMIO register described by 'r' into r->reg, using the
 * 32- or 64-bit box accessor selected by the W64 flag. On success
 * the VLD flag is set so the value may later be restored by
 * one_mmio_wr(); an unknown box type clears VLD instead.
 * TBOX registers are only read on SKUs that have TBOXes.
 */
static void
one_mmio_rd(RegRec * r)
{
 switch(r->box & 0xf) {
 case SBOX:
 if (r->box & W64)
 r->reg = mr_sbox_rq(0, r->ofs);
 else
 r->reg = (uint64_t) mr_sbox_rl(0, r->ofs);
 break;
 case DBOX:
 if (r->box & W64)
 r->reg = mr_dbox_rq(r->num, r->ofs);
 else
 r->reg = (uint64_t) mr_dbox_rl(r->num, r->ofs);
 break;
 case GBOX:
 if (r->box & W64)
 r->reg = mr_gbox_rq(r->num, r->ofs);
 else
 r->reg = (uint64_t) mr_gbox_rl(r->num, r->ofs);
 break;
 case TBOX:
 if (mr_txs()) {
 if (r->box & W64)
 r->reg = mr_tbox_rq(r->num, r->ofs);
 else
 r->reg = (uint64_t) mr_tbox_rl(r->num, r->ofs);
 }
 break;
 default:
 r->box &= ~VLD;
 return;
 }
 r->box |= VLD;

#if PM_VERBOSE
 printk("mmio_rd: box %d, idx %3d, ofs %04x -> %llx\n",
 r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}
+
+/*
+ * Write r->reg back to the BOX MMIO register described by 'r', using
+ * the 32- or 64-bit accessor selected by the W64 flag. Records whose
+ * VLD flag is clear are skipped; on completion VLD is cleared again
+ * so a stale value is never written twice. TBOX registers are only
+ * written when mr_txs() reports TBOXes present.
+ */
+static void
+one_mmio_wr(RegRec * r)
+{
+    int box = r->box & 0xf;
+    int wide = (r->box & W64) != 0;
+
+    if (! (r->box & VLD))
+        return;
+
+    if (box == SBOX) {
+        if (wide)
+            mr_sbox_wq(0, r->ofs, r->reg);
+        else
+            mr_sbox_wl(0, r->ofs, (uint32_t) r->reg);
+    }
+    else if (box == DBOX) {
+        if (wide)
+            mr_dbox_wq(r->num, r->ofs, r->reg);
+        else
+            mr_dbox_wl(r->num, r->ofs, (uint32_t) r->reg);
+    }
+    else if (box == GBOX) {
+        if (wide)
+            mr_gbox_wq(r->num, r->ofs, r->reg);
+        else
+            mr_gbox_wl(r->num, r->ofs, (uint32_t) r->reg);
+    }
+    else if (box == TBOX && mr_txs()) {
+        if (wide)
+            mr_tbox_wq(r->num, r->ofs, r->reg);
+        else
+            mr_tbox_wl(r->num, r->ofs, (uint32_t) r->reg);
+    }
+    /* Unknown BOX codes fall through: nothing written, VLD dropped */
+    r->box &= ~VLD;
+
+#if PM_VERBOSE
+    printk("mmio_wr: box %d, idx %3d, ofs %04x <- %llx\n",
+           r->box & 0xf, r->num, r->ofs, r->reg);
+#endif
+}
+
+
+#if RAS_SAVE_MSR
+/*
+ * Read the MSR described by 'r' into r->reg and set VLD in r->box.
+ * GMSR entries are read on the current CPU; LMSR entries (when
+ * RAS_SAVE_CPU_MSR is enabled) are read on CPU r->num via IPI.
+ * Unknown box codes clear VLD and return without touching r->reg.
+ */
+static void
+one_msr_rd(RegRec * r)
+{
+    uint32_t msw, lsw;
+
+    if ((r->box & 0xf) == GMSR) {
+        rdmsr(r->ofs, lsw, msw);
+    }
+#if RAS_SAVE_CPU_MSR
+    else if ((r->box & 0xf) == LMSR) {
+        rdmsr_on_cpu(r->num, r->ofs, &lsw, &msw);
+    }
+#endif
+    else {
+        r->box &= ~VLD;
+        return;
+    }
+    r->reg = ((uint64_t) msw << 32) | (uint64_t) lsw;
+    r->box |= VLD;
+
+#if PM_VERBOSE
+    printk("msr_rd: box %d, idx %3d, ofs %04x -> %llx\n",
+           r->box & 0xf, r->num, r->ofs, r->reg);
+#endif
+}
+
+/*
+ * Write r->reg back to the MSR described by 'r'. Records whose VLD
+ * flag is clear are skipped; VLD is cleared after the write so a
+ * stale value is never restored twice. GMSR entries are written on
+ * the current CPU, LMSR entries on CPU r->num via IPI.
+ */
+static void
+one_msr_wr(RegRec * r)
+{
+    uint32_t msw, lsw;
+
+    if (! (r->box & VLD))
+        return;
+
+    msw = (uint32_t) (r->reg >> 32);
+    lsw = (uint32_t) r->reg;
+    if ((r->box & 0xf) == GMSR) {
+        wrmsr(r->ofs, lsw, msw);
+    }
+#if RAS_SAVE_CPU_MSR
+    else if ((r->box & 0xf) == LMSR) {
+        wrmsr_on_cpu(r->num, r->ofs, lsw, msw);
+    }
+#endif
+    r->box &= ~VLD;
+
+#if PM_VERBOSE
+    printk("msr_wr: box %d, idx %3d, ofs %04x <- %llx\n",
+           r->box & 0xf, r->num, r->ofs, r->reg);
+#endif
+}
+#endif /* RAS_SAVE_MSR */
+
+
+/*
+ * Preserve all HW registers that will be lost in
+ * deep sleep states. This will be SBOX registers
+ * above offset 0x7000 and all other BOX registers.
+ */
+
+/*
+ * PM suspend callback: snapshot every register listed in the
+ * susp_mmio/susp_msr/susp_lcl_msr tables. The pm_entry counter
+ * brackets the work so pm_exit() can wait for in-flight calls.
+ */
+static void
+mr_suspend(void)
+{
+ int i;
+
+ atomic_inc(&pm_entry);
+
+ /*
+ * Save SBOX_MCA_INT_EN first and clear it.
+ * No more uncore MCAs will get through.
+ */
+ one_mmio_rd(susp_mmio + 0);
+#if SAVE_BLOCK_MCA
+ mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
+#endif
+
+ /*
+ * Save remaining BOX MMIOs
+ */
+ for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
+ one_mmio_rd(susp_mmio + i);
+
+#if RAS_SAVE_MSR
+ /*
+ * Save global MSRs and set MCIP
+ * No new exceptions will be asserted
+ */
+ for(i = 0; i < ARRAY_SIZE(susp_msr); i++)
+ one_msr_rd(susp_msr + i);
+#if SAVE_BLOCK_MCA
+ wrmsr(MSR_IA32_MCG_STATUS, MCG_STATUS_MCIP, 0);
+#endif
+
+#if RAS_SAVE_CPU_MSR
+ /*
+ * Save per-CPU MSRs
+ * NOTE(review): one_msr_rd() on LMSR entries uses rdmsr_on_cpu(),
+ * which IPIs (and thus wakes) the target CPU — see pm_init()'s TBD.
+ */
+ for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
+ one_msr_rd(susp_lcl_msr + i);
+#endif
+#endif
+
+ atomic_dec(&pm_entry);
+}
+
+
+/*
+ * Undo side effects of a suspend call.
+ * Nothing to do unless we turned MC handlers off.
+ */
+
+/*
+ * PM suspend-failure callback: re-enable MC delivery and drop the
+ * register snapshot taken by mr_suspend() (only entry 0 of each
+ * table is written back; the rest are merely invalidated since the
+ * hardware never lost them).
+ */
+static void
+mr_cancel(void)
+{
+ int i;
+
+ atomic_inc(&pm_entry);
+
+ /*
+ * Restore SBOX_MCA_INT_EN to unblock uncore MCs
+ * Invalidate all other saved MMIO registers.
+ */
+ one_mmio_wr(susp_mmio + 0);
+ for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
+ susp_mmio[i].box &= ~VLD;
+
+#if RAS_SAVE_MSR
+ /*
+ * Restore IA32_MCG_STATUS to unblock core MCs
+ * Invalidate all other saved MSR registers.
+ */
+ one_msr_wr(susp_msr + 0);
+ for(i = 1; i < ARRAY_SIZE(susp_msr); i++)
+ susp_msr[i].box &= ~VLD;
+
+#if RAS_SAVE_CPU_MSR
+ for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
+ susp_lcl_msr[i].box &= ~VLD;
+#endif
+#endif
+
+ atomic_dec(&pm_entry);
+}
+
+
+/*
+ * Restore all HW registers that we use.
+ */
+
+/*
+ * PM resume callback: write back every register saved by
+ * mr_suspend(). Order matters — the MC gating registers
+ * (SBOX_MCA_INT_EN, IA32_MCG_STATUS; entry 0 of each table)
+ * are restored last, so events cannot fire before the banks
+ * are reconfigured.
+ */
+static void
+mr_resume(void)
+{
+ int i;
+
+ atomic_inc(&pm_entry);
+
+ /*
+ * Clear uncore MCA banks (just in case)
+ * box_reset(0) clears without re-arming; CTLs are restored below.
+ */
+ if (susp_mmio[0].box & VLD)
+ box_reset(0);
+
+ /*
+ * Restore all BOX MMIOs but SBOX_MCA_INT_EN
+ */
+ for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
+ one_mmio_wr(susp_mmio + i);
+
+ /*
+ * Then restore SBOX_MCA_INT_EN to enable uncore MCAs
+ */
+ one_mmio_wr(susp_mmio + 0);
+
+#if RAS_SAVE_MSR
+ /*
+ * Restore all global MSRs but IA32_MCG_STATUS
+ */
+ for(i = 1; i < ARRAY_SIZE(susp_msr); i++)
+ one_msr_wr(susp_msr + i);
+
+ /*
+ * Then restore IA32_MCG_STATUS to allow core MCAs
+ */
+ one_msr_wr(susp_msr + 0);
+
+#if RAS_SAVE_CPU_MSR
+ /*
+ * Restore all per-cpu MSRs
+ */
+ for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
+ one_msr_wr(susp_lcl_msr + i);
+#endif
+#endif
+
+ atomic_dec(&pm_entry);
+}
+
+
+/*
+ * Callback from PM notifier chain.
+ * TBD: should we test for odd state transitions and recursions?
+ */
+
+/*
+ * PM notifier-chain callback: dispatch device suspend/resume/
+ * suspend-failure events to the matching handler. All other
+ * events are ignored. Always returns 0.
+ */
+static int
+mr_pm_callback(struct notifier_block *nb, unsigned long event, void *msg)
+{
+    if (event == MICPM_DEVEVENT_SUSPEND)
+        mr_suspend();
+    else if (event == MICPM_DEVEVENT_RESUME)
+        mr_resume();
+    else if (event == MICPM_DEVEVENT_FAIL_SUSPEND)
+        mr_cancel();
+    /*
+     * Anything else sent down this chain is of no interest here.
+     */
+    return 0;
+}
+
+
+
+/*
+**
+** The PM module loads before RAS, so we must setup
+** the API to support power management, i.e register.
+** PM needs:
+** - Notification when MT changes certain variables.
+** Provided by a call-out list that the PM sets
+** at registration time.
+** - Access to MT calls.
+** The PM module can use micras_mt_call() for access.
+** Since PM loads first, this function needs to
+** be passed at registration time.
+** RAS needs:
+** - list of core voltages (for CVOLT query).
+** We pass a pointer to the voltage list and the
+** voltage list counter to PM module, who will
+** fill in the actual values (not available until
+** core-freq driver loads).
+** - list of core frequencies (for CFREQ query).
+** Same solution as for CVOLT.
+** - Notifications for throttle state changes.
+** - Power management notifications for suspend/resume.
+**
+** Note: can one notifier block be inserted in multiple
+** chains? It is assumed not, which requires two blocks
+** both pointing to the same local function.
+*/
+
+extern struct mr_rsp_freq freq;
+extern struct mr_rsp_volt volt;
+
+struct micpm_params pm_reg; /* Our data for PM */
+struct micpm_callbacks pm_cb; /* PM data for us */
+
+extern void micpm_device_register(struct notifier_block *n);
+extern void micpm_device_unregister(struct notifier_block *n);
+extern void micpm_atomic_notifier_register(struct notifier_block *n);
+extern void micpm_atomic_notifier_unregister(struct notifier_block *n);
+
+/* Device suspend/resume/fail events -> mr_pm_callback() */
+static struct notifier_block ras_deviceevent = {
+ .notifier_call = mr_pm_callback,
+};
+
+/* Throttle events, atomic (non-sleeping) notifier chain */
+static struct notifier_block ras_throttle_event_ns = {
+ .notifier_call = mr_pm_throttle_callback,
+};
+
+/* Throttle events, blocking notifier chain (same handler) */
+static struct notifier_block ras_throttle_event = {
+ .notifier_call = mr_pm_throttle_callback,
+};
+
+
+/*
+ * Setup PM callbacks and SCIF handler.
+ */
+
+/*
+ * MT access wrapper handed to the PM module at registration time.
+ * Brackets micras_mt_call() with the pm_entry counter so that
+ * pm_exit() can wait for outstanding calls to drain before unload.
+ * Returns whatever micras_mt_call() returns.
+ */
+static int
+pm_mt_call(uint16_t cmd, void * buf)
+{
+    int rtn;
+
+    atomic_inc(&pm_entry);
+    rtn = micras_mt_call(cmd, buf);
+    atomic_dec(&pm_entry);
+    return rtn;
+}
+
+
+/*
+ * Module init for the PM interface: replicate the per-CPU MSR save
+ * table, fetch the thermal-throttle trip point from the SMC, and
+ * register data/callbacks plus notifier blocks with the MIC PM
+ * driver. Returns 0 on success, 1 if PM registration failed.
+ */
+int __init
+pm_init(void)
+{
+ extern int mr_smc_rd(uint8_t, uint32_t *);
+
+#if RAS_SAVE_CPU_MSR
+ /*
+ * Preset MCA bank MSR register descriptions
+ *
+ *TBD: We have to use IPIs to read MSRs, which will wake
+ * up cores at sleep when this function is called.
+ * PM module may not like this at all.
+ */
+ int i, j;
+ for(i = 1; i < nr_cpu_ids; i++) {
+ /* Copy CPU 0's four template slots and retarget them at CPU i */
+ j = 4 * i;
+ susp_lcl_msr[j] = susp_lcl_msr[0];
+ susp_lcl_msr[j + 1] = susp_lcl_msr[1];
+ susp_lcl_msr[j + 2] = susp_lcl_msr[2];
+ susp_lcl_msr[j + 3] = susp_lcl_msr[3];
+ susp_lcl_msr[j].num = i;
+ susp_lcl_msr[j + 1].num = i;
+ susp_lcl_msr[j + 2].num = i;
+ susp_lcl_msr[j + 3].num = i;
+ }
+#endif
+
+ /*
+ * Get temperature where power throttle becomes thermal throttle
+ * NOTE(review): return value of mr_smc_rd() is ignored; on SMC
+ * read failure ttl_tcrit is left unchanged — confirm it has a
+ * sane default.
+ */
+ mr_smc_rd(0x4c, &ttl_tcrit);
+
+ /*
+ * Register with the MIC Power Management driver.
+ */
+ pm_reg.volt_lst = volt.supt;
+ pm_reg.volt_len = &volt.slen;
+ pm_reg.volt_siz = ARRAY_SIZE(volt.supt);
+ pm_reg.freq_lst = freq.supt;
+ pm_reg.freq_len = &freq.slen;
+ pm_reg.freq_siz = ARRAY_SIZE(freq.supt);
+ pm_reg.mt_call = pm_mt_call;
+ pm_reg.mt_ttl = mr_throttle;
+ if (micpm_ras_register(&pm_cb, &pm_reg))
+ goto fail_pm;
+
+ /*
+ * Get into the PM notifier lists
+ * MicPm reports events in 2 chains, one atomic and one
+ * blocking. Our callback will not block!
+ */
+ micpm_atomic_notifier_register(&ras_throttle_event_ns);
+ micpm_notifier_register(&ras_throttle_event);
+
+ /* Suspend/resume events are only delivered on KnC C-step parts */
+ if (boot_cpu_data.x86_mask == KNC_C_STEP)
+ micpm_device_register(&ras_deviceevent);
+
+ printk("RAS.pm: init complete\n");
+ return 0;
+
+fail_pm:
+ printk("RAS.pm: init failed\n");
+ return 1;
+}
+
+
+/*
+ * Cleanup for module unload.
+ * Clear/restore hooks in the native MCA handler.
+ */
+
+/*
+ * Module unload: unhook notifier blocks, de-register from the PM
+ * module, then spin until every in-flight call into this module
+ * (tracked by pm_entry) has drained.
+ */
+void __exit
+pm_exit(void)
+{
+ /*
+ * Get off the PM notifier list
+ */
+ micpm_atomic_notifier_unregister(&ras_throttle_event_ns);
+ micpm_notifier_unregister(&ras_throttle_event);
+
+ if (boot_cpu_data.x86_mask == KNC_C_STEP)
+ micpm_device_unregister(&ras_deviceevent);
+
+ /*
+ * De-register with the PM module.
+ */
+ micpm_ras_unregister();
+
+ /*
+ * Wait for any calls into the module to finish.
+ */
+ while(atomic_read(&pm_entry))
+ cpu_relax();
+
+ printk("RAS.pm: exit complete\n");
+}
+
+#endif /* USE_PM */
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * RAS handler for uncore MC events
+ *
+ * Contains code to intercept MC events, collect information
+ * from uncore MCA banks and handle the situation.
+ *
+ * In case of a severe event, defined by corrupted context,
+ * the handler will add a record of the event in the designated
+ * EEPROM hanging off the Over Clocking I2C bus. After that
+ * a message will be sent to the SMC (enabling IPMI notifications)
+ * and at last a message is sent to the host via the MC SCIF
+ * connection.
+ *
+ * Lesser events will also be sent to the host on a 'FYI' basis,
+ * but no record will be stored in the event log.
+ *
+ * This is in all aspects similar to the reaction to a severe
+ * core MC event. Differences are in the MC bank access (mmio),
+ * and that the event is delivered via an interrupt instead of
+ * an exception. Still, the handler cannot expect any support
+ * from the OS.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/nmi.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/mic/mic_common.h>
+#include <asm/mic/mic_knc/autobaseaddress.h>
+#include <asm/mic/mic_knc/micsboxdefine.h>
+#include "micras.h"
+
+
+/*
+ * Hooks placed in the native machine check handler
+ * See file arch/x86/kernel/traps.c for placement
+ *
+ * nmi Entered NMI exception handler.
+ * Called before any other tests, which allow us
+ * to test for and handle un-core MCA events before
+ * the traditional NMI handling.
+ * Note that the mce-inject mechanism also uses
+ * NMI's to distribute calls to do_machine_check().
+ */
+
+extern int (*mca_nmi)(int);
+
+
+
+/*
+ * Table of un-core MCA banks.
+ * Though there are differences in register count and sizes, un-core bank
+ * registers are always spaced 8 bytes apart, so all we need to know is
+ * the location of the first MCA bank register (CTL) to find them.
+ * If bank is present, the bank register offsets for ctl, status, addr,
+ * and misc are thus 0, 8, 16, and 24 respectively.
+ * Default CTL masks pulled from the register documentation
+ * Some SKUs don't have support for all BOXs but that will be handled
+ * at runtime in the support code, not at compile time by this table.
+ */
+
+
+#ifdef CONFIG_ML1OM
+#define SBOX_DEF 0x000e /* All (7) */
+#define DBOX_DEF 0x0003 /* All (2) */
+#define GBOX_DEF 0x0003 /* All (2) */
+#endif
+#ifdef CONFIG_MK1OM
+#define SBOX_DEF 0x03ce /* All - PCIe errors (7) */
+#define DBOX_DEF 0x000f /* All (4) */
+#define GBOX_DEF 0x3ffffffff /* All (34) */
+#define TBOX_DEF 0x001f /* All (5) */
+#endif
+
+#define MCU_CTL_64 (1 << 0) /* Bank has 64 bit CTL register */
+#define MCU_NO_ADDR (1 << 1) /* Bank has no ADDR register */
+#define MCU_ADDR_32 (1 << 2) /* Bank has 32 bit ADDR register */
+#define MCU_NO_MISC (1 << 3) /* Bank has no MISC register */
+#define MCU_MISC_64 (1 << 4) /* Bank has 64 bit MISC register */
+
+#define MCU_CTRL 0
+#define MCU_STAT 8
+#define MCU_ADDR 16
+#define MCU_MISC 24
+
+/*
+ * Descriptor for one class of un-core MCA bank (SBOX/DBOX/GBOX/TBOX):
+ * instance count, origin tag, quirk flags, base offset, default CTL
+ * mask, and the four MMIO accessors used to reach the bank registers.
+ */
+typedef struct _mcu_rec {
+ uint8_t num; /* 'BOX' count */
+ uint8_t org; /* Origin code */
+ uint8_t qflg; /* Quirk flags */
+ uint16_t ofs; /* MCA bank base offset */
+ uint64_t ctl; /* Initial CTL mask */
+ uint32_t (*rl)(int, uint32_t); /* 32-bit MMIO read */
+ void (*wl)(int, uint32_t, uint32_t); /* 32-bit MMIO write */
+ uint64_t (*rq)(int, uint32_t); /* 64-bit MMIO read */
+ void (*wq)(int, uint32_t, uint64_t); /* 64-bit MMIO write */
+} McuRec;
+
+
+/* One entry per BOX type; the TBOX entry exists only on KnC (CONFIG_MK1OM) */
+static McuRec mcu_src[] = {
+ { 1, MC_ORG_SBOX, MCU_MISC_64, SBOX_MCX_CTL_LO,
+ SBOX_DEF, mr_sbox_rl, mr_sbox_wl, mr_sbox_rq, mr_sbox_wq },
+ { DBOX_NUM, MC_ORG_DBOX, MCU_NO_MISC, DBOX_MC2_CTL,
+ DBOX_DEF, mr_dbox_rl, mr_dbox_wl, mr_dbox_rq, mr_dbox_wq },
+ { GBOX_NUM, MC_ORG_GBOX, MCU_CTL_64, GBOX_FBOX_MCA_CTL_LO,
+ GBOX_DEF, mr_gbox_rl, mr_gbox_wl, mr_gbox_rq, mr_gbox_wq },
+#ifdef CONFIG_MK1OM
+ { TBOX_NUM, MC_ORG_TBOX, MCU_CTL_64 | MCU_NO_MISC | MCU_ADDR_32, TXS_MCX_CONTROL,
+ TBOX_DEF, mr_tbox_rl, mr_tbox_wl, mr_tbox_rq, mr_tbox_wq },
+#endif
+};
+
+#define GBOX_BROKEN 1 /* Set if GBOX MCA bank is broken */
+
+#if GBOX_BROKEN
+/*
+ * Si design managed to break the GBOX MCA bank concept
+ * by not filling useful data into ADDR and MISC registers.
+ * Instead they use a bunch of registers in another part
+ * of the GBOX (mbox to be specific) to hold this info.
+ * In order to get at the right register it is necessary
+ * to partially decode the STATUS register and from there
+ * select an GBOX.MBOX register.
+ * Since the new registers are all 32 bits wide, we'll stick
+ * the value into MISC register if Misc_V bit of STATUS is
+ * not set. The following table is used for register selection
+ *
+ * model code base width Chan Notes
+ * 0 017c 32 0 26 bit address, CRC (retrain)
+ * 1 097c 32 1 26 bit address, CRC (retrain)
+ * 2 01e0 32 0 26 bit address, ECC
+ * 3 09e0 32 1 26 bit address, ECC
+ * 4 01dc 32 0 26 bit address, UC CAPE
+ * 5 09dc 32 1 26 bit address, UC CAPE
+ * 31 01a4 32 0 26 bit address, UC ECC
+ * 32 09a4 32 1 26 bit address, UC ECC
+ *
+ * Note: model code is simply the enable bit number in CTL
+ */
+
+/* Lookup: GBOX STATUS model code -> GBOX.MBOX register holding the detail
+ * value (see table in the comment block above for the full decode) */
+static struct liu {
+ uint16_t mcode;
+ uint16_t base;
+} liu[] = {
+ { 0, 0x17c }, /* Correctable CRC (retrain) ch 0 */
+ { 1, 0x97c }, /* Correctable CRC (retrain) ch 1 */
+ { 2, 0x1e0 }, /* Correctable ECC, ch 0 */
+ { 3, 0x9e0 }, /* Correctable ECC, ch 1 */
+ { 4, 0x1dc }, /* Uncorrectable CAPE, ch 0 */
+ { 5, 0x9dc }, /* Uncorrectable CAPE, ch 1 */
+ { 31, 0x1a4 }, /* Uncorrectable ECC, ch 0 */
+ { 32, 0x9a4 } /* Uncorrectable ECC, ch 1 */
+};
+
+/*
+ * Work around the broken GBOX MCA bank (see comment block above):
+ * when Status.Misc_v (bit 59) is clear, decode the model code from
+ * STATUS bits 31:16 and, if it matches a 'liu' table entry, load the
+ * corresponding GBOX.MBOX register into mi->misc. Misc_v is left
+ * clear on purpose to distinguish this hack from real bank contents.
+ */
+static void
+mcu_gbox_fixup(McuRec * mr, int num, MceInfo * mi)
+{
+    uint16_t mcode;
+    int n;
+
+    if (mi->status & (1ULL << 59))
+        return;                        /* Real MISC data present */
+
+    mcode = GET_BITS(31, 16, mi->status);
+    for(n = 0; n < ARRAY_SIZE(liu); n++) {
+        if (liu[n].mcode != mcode)
+            continue;
+        mi->misc = (uint64_t) mr->rl(num, liu[n].base);
+        return;
+    }
+}
+#endif
+
+/*
+ * Read Ctrl, Addr and Misc registers from an un-core MCA bank.
+ * The Status register is read/cleared in mcu_scan().
+ */
+
+/*
+ * Fill mi->ctl/addr/misc from an un-core MCA bank, honoring the
+ * bank's quirk flags (missing or 32-bit registers read as 0 or via
+ * the narrow accessor). The Status register itself is read/cleared
+ * by the caller (mcu_scan()). GBOX banks get the fixup treatment.
+ */
+static void
+mcu_read(McuRec * mr, int num, MceInfo * mi)
+{
+    uint8_t q = mr->qflg;
+
+    mi->ctl = (q & MCU_CTL_64)
+        ? mr->rq(num, mr->ofs + MCU_CTRL)
+        : (uint64_t) mr->rl(num, mr->ofs + MCU_CTRL);
+
+    if (q & MCU_NO_ADDR)
+        mi->addr = 0;
+    else
+        mi->addr = (q & MCU_ADDR_32)
+            ? (uint64_t) mr->rl(num, mr->ofs + MCU_ADDR)
+            : mr->rq(num, mr->ofs + MCU_ADDR);
+
+    if (q & MCU_NO_MISC)
+        mi->misc = 0;
+    else
+        mi->misc = (q & MCU_MISC_64)
+            ? mr->rq(num, mr->ofs + MCU_MISC)
+            : (uint64_t) mr->rl(num, mr->ofs + MCU_MISC);
+
+#if GBOX_BROKEN
+    if (mr->org == MC_ORG_GBOX)
+        mcu_gbox_fixup(mr, num, mi);
+#endif
+}
+
+
+/*
+ * Reset one un-core MCA bank
+ * Any quirks go here.
+ */
+
+/*
+ * Reset one un-core MCA bank: zero STAT, ADDR and MISC (where the
+ * bank has them), then write CTL — the default enable mask when
+ * 'arm' is set, 0 otherwise. KnC stepping/SKU quirks mask out
+ * individual enable bits before the CTL write.
+ */
+static void
+mcu_reset(McuRec * mr, int num, int arm)
+{
+    uint64_t enable = arm ? mr->ctl : 0;
+
+    mr->wq(num, mr->ofs + MCU_STAT, 0);
+
+    if (! (mr->qflg & MCU_NO_ADDR)) {
+        if (mr->qflg & MCU_ADDR_32)
+            mr->wl(num, mr->ofs + MCU_ADDR, 0);
+        else
+            mr->wq(num, mr->ofs + MCU_ADDR, 0);
+    }
+
+    if (! (mr->qflg & MCU_NO_MISC)) {
+        if (mr->qflg & MCU_MISC_64)
+            mr->wq(num, mr->ofs + MCU_MISC, 0);
+        else
+            mr->wl(num, mr->ofs + MCU_MISC, 0);
+    }
+
+#ifdef CONFIG_MK1OM
+    if (enable && mr->org == MC_ORG_SBOX && mic_hw_stepping(0) == KNC_A_STEP)
+        enable &= ~PUT_BIT(3, 1);                 /* A0 SBOX 'unclaimed address' bug */
+
+    if (enable && mr->org == MC_ORG_GBOX && mr_mch() != 16)
+        enable &= ~(uint64_t) PUT_BIT(6, 1);      /* B0 GBOX 'Invalid Channel' (SKU 3 & 4) */
+#endif
+
+    if (mr->qflg & MCU_CTL_64)
+        mr->wq(num, mr->ofs + MCU_CTRL, enable);
+    else
+        mr->wl(num, mr->ofs + MCU_CTRL, (uint32_t) enable);
+}
+
+
+/*
+ * Un-core MC bank pre-scan
+ * Walk through all un-core MC sources to see if any events are pending.
+ * Stops on 1st match where STATUS has both VAL bit set. On some BOXes,
+ * like GBOX, interrupt may be signalled without the EN bit being set.
+ * See HSD 4116374 for details.
+ */
+
+/*
+ * Quick sweep of all un-core MCA banks: returns 1 as soon as any
+ * STATUS register has its VAL bit set, 0 if nothing is pending.
+ * EN is deliberately not checked (some BOXes signal without it,
+ * see HSD 4116374). TBOX banks are skipped on SKUs without TBOXes.
+ */
+static int
+mcu_prescan(void)
+{
+    McuRec * mr;
+    int b, u;
+
+    for(b = 0; b < ARRAY_SIZE(mcu_src); b++) {
+        mr = mcu_src + b;
+
+#ifdef CONFIG_MK1OM
+        if (mr->org == MC_ORG_TBOX && !mr_txs())
+            continue;
+#endif
+
+        for(u = 0; u < mr->num; u++)
+            if (mr->rq(u, mr->ofs + MCU_STAT) & MCI_STATUS_VAL)
+                return 1;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Un-core MC bank scanner.
+ * Walks through all un-core MC sources for new events.
+ * If any found, then process them same way as core events.
+ */
+
+/*
+ * Full scan of all un-core MCA banks; returns the number of valid
+ * and enabled events processed. Fatal (UC) events are logged to the
+ * event EEPROM and reported to the SMC (KnC only) and, when RAS_HALT
+ * is set, trigger a panic after all banks have been visited.
+ */
+static int
+mcu_scan(void)
+{
+ MceInfo mc, uc;
+ int gone, seen;
+ int i, j;
+ struct _mcu_rec * mr;
+
+ /*
+ * Walk list of known un-core MC sources
+ */
+ gone = seen = 0;
+ memset(&uc, 0, sizeof(uc));
+ for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
+ mr = mcu_src + i;
+
+#ifdef CONFIG_MK1OM
+ if (mr->org == MC_ORG_TBOX && !mr_txs())
+ continue;
+#endif
+
+ for(j = 0; j < mr->num; j++) {
+
+ /*
+ * Read status to see if we have something of interest.
+ * As per HSD 4116374 the status register is cleared
+ * after read, if it had valid content.
+ *TBD: Clear unconditionally?
+ */
+ mc.status = mr->rq(j, mr->ofs + MCU_STAT);
+ if (mc.status & MCI_STATUS_VAL)
+ mr->wq(j, mr->ofs + MCU_STAT, 0);
+ else
+ continue;
+
+ /*
+ * Bank had valid content (VAL bit set).
+ * Verify the event was subscribed to (EN bit set).
+ * If not, the event is ignored.
+ */
+ if (! (mc.status & MCI_STATUS_EN))
+ continue;
+
+ /*
+ * Valid and enabled event, read remaining bank registers.
+ */
+ seen++;
+ mcu_read(mr, j, &mc);
+
+ /*
+ * Fill out blanks in the MceInfo record
+ */
+ mc.org = mr->org;
+ mc.id = j;
+ mc.stamp = get_seconds();
+ mc.flags = (mc.status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0;
+
+ /*
+ * If any way to detect injected errors then this is
+ * the place to do so and indicate by MC_FLG_FALSE flag
+ */
+
+ if (mc.flags & MC_FLG_FATAL) {
+#ifdef CONFIG_MK1OM
+#if MC_VERBOSE
+ /* NOTE(review): mc.status is uint64_t; '%lx' assumes 64-bit
+ * long — confirm against ee_printk's format handling. */
+ ee_printk("Uncore fatal MC: org %d, id %d, status %lx\n", mc.org, mc.id, mc.status);
+#endif
+
+ /*
+ * Log UC events in the eeprom.
+ */
+ micras_mc_log(&mc);
+ mc.flags |= MC_FLG_LOG;
+
+ /*
+ * Notify SMC that we've had a serious machine check error.
+ */
+ micras_mc_ipmi(&mc, 1);
+#endif
+ /*
+ * Remember 1st fatal (UC) event
+ */
+ if (! gone++)
+ uc = mc;
+ }
+
+ /*
+ * Notify host
+ */
+ micras_mc_send(&mc, 1);
+
+ /*
+ * Filter corrected errors.
+ * The filter may return a mask of enable bits to drop
+ * from this bank's CTL (storm suppression).
+ */
+ if (! (mc.flags & MC_FLG_FATAL)) {
+ uint64_t tsc, msk;
+
+ tsc = rdtsc();
+ msk = micras_mc_filter(&mc, tsc, 1);
+ if (msk) {
+#if MC_VERBOSE
+ ee_printk("Uncore filter: org %d, id %d, ctrl %lx, mask %lx\n", mc.org, mc.id, mc.ctl, msk);
+#endif
+ if (mr->qflg & MCU_CTL_64)
+ mr->wq(j, mr->ofs + MCU_CTRL, mc.ctl & ~msk);
+ else
+ mr->wl(j, mr->ofs + MCU_CTRL, (uint32_t)(mc.ctl & ~msk));
+ }
+ }
+
+ /*
+ * Any event post processing goes here.
+ * This would be things like cache line refresh and such.
+ * Actual algorithms are TBD.
+ */
+ }
+ }
+
+#if RAS_HALT
+ if (gone) {
+ atomic_inc(&mce_entry);
+ panic("FATAL un-core machine check event:\n"
+ "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
+ uc.org, uc.id, uc.ctl, uc.status, uc.addr, uc.misc);
+ }
+#endif
+
+ return seen;
+}
+
+
+/*
+ * NMI handler.
+ *
+ * Once we get control in 1st interrupt (NMI or regular), we'll
+ * use IPIs from the local APIC to force all active CPU's into
+ * our RAS NMI handler, similar to the core MC handler.
+ * After that, the same logic as for the generic MC handler is
+ * applied to corral all CPU's through well defined rendez-vous
+ * points where only one cpu gets to run the un-core MC event
+ * scan while everybody else are sitting in a holding pen.
+ * If containment wasn't an issue we could simply let the BP
+ * run the scan without involving other CPUs at all.
+ */
+
+#define SPINUNIT 50
+#define SERIAL_MCU 0
+
+struct cpumask mcu_exc_mask; /* NMI recipients */
+static int mcu_cpu = -1; /* SBOX target CPU */
+#if MCU_NMI
+static uint64_t mcu_redir; /* SBOX I/O-APIC redirection entry */
+static uint64_t mcu_old_redir; /* Restore value for redirection entry */
+#else
+unsigned int mcu_eoi; /* 1st interrupt from local APIC */
+#endif
+static atomic_t mcu_callin; /* Entry rendez-vous gate */
+static atomic_t mcu_leavin; /* Hold rendez-vous gate */
+
+
+/*
+ * Spin-wait helper for the rendez-vous loops: returns 1 once
+ * *timeout (nanoseconds) is exhausted; otherwise burns one SPINUNIT
+ * of wall time — feeding the NMI watchdog — and returns 0.
+ */
+static int
+mcu_timed_out(int64_t * timeout)
+{
+    if (*timeout >= SPINUNIT) {
+        *timeout -= SPINUNIT;
+        touch_nmi_watchdog();
+        ndelay(SPINUNIT);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+/*
+ * Entry/hold rendez-vous for all online CPUs.
+ * Returns this CPU's 1-based arrival order, or -1 on timeout.
+ * CPU with order 1 falls through immediately; the rest are held
+ * until their number comes up in mcu_leavin (serialized exit).
+ */
+static int
+mcu_wait(void)
+{
+ int cpus, order;
+ int64_t timeout;
+
+ cpus = num_online_cpus();
+ timeout = 1 * NSEC_PER_SEC; /* 1 Second */
+
+ /*
+ * Flush all caches
+ */
+
+ /*
+ * 'Entry' rendez-vous point.
+ * Wait here until all CPUs have entered.
+ */
+ order = atomic_inc_return(&mcu_callin);
+ while(atomic_read(&mcu_callin) != cpus) {
+ if (mcu_timed_out(&timeout)) {
+ /*
+ * Timeout waiting for CPU enter rendez-vous
+ */
+ return -1;
+ }
+ }
+
+ /*
+ * 'Hold' rendez-vous point.
+ * All CPUs drop by here 'simultaneously'.
+ * The first CPU that 'enter'ed (order of 1) will
+ * fall thru while the others wait until their
+ * number comes up in the 'leavin' counter
+ * (or if a timeout happens). This also has a
+ * serializing effect, where one CPU leaves this
+ * loop at a time.
+ */
+ if (order == 1) {
+#if SERIAL_MCU
+ atomic_set(&mcu_leavin, 1);
+#endif
+ }
+ else {
+ while(atomic_read(&mcu_leavin) < order) {
+ if (mcu_timed_out(&timeout)) {
+ /*
+ * Timeout waiting in CPU hold rendez-vous
+ */
+ return -1;
+ }
+ }
+ }
+
+ return order;
+}
+
+
+/*
+ * Exit rendez-vous and un-core scan dispatch.
+ * 'order' is this CPU's arrival order from mcu_wait() (-1 = timeout).
+ * The first arrival (order 1) runs mcu_scan() while the others park
+ * in the exit loop; on any timeout the rendez-vous counters are
+ * reset so all CPUs can leave. Returns 0 on success, -1 on timeout.
+ */
+static int
+mcu_go(int order)
+{
+ int ret;
+ int64_t timeout;
+
+ ret = -1;
+ if (order < 0)
+ goto mcu_reset;
+
+#if SERIAL_MCU
+ /*
+ * If any 'per-CPU' activity is needed in isolation
+ * (one CPU at a time) then that code needs to go here.
+ */
+
+ atomic_inc(&mcu_leavin); /* Next CPU out of hold */
+#endif
+
+ timeout = NSEC_PER_SEC; /* 1 Second */
+ if (order == 1) {
+ int cpus;
+
+ /*
+ * The first CPU that entered (order of 1) waits here
+ * for the others to leave the 'hold' loop in mcu_wait()
+ * and enter the 'exit' rendez-vous loop below.
+ * Once they are there, it will run the uncore MCA bank
+ * scan while the others are parked in 'exit' loop below.
+ */
+ cpus = num_online_cpus();
+#if SERIAL_MCU
+ while(atomic_read(&mcu_leavin) <= cpus) {
+ if (mcu_timed_out(&timeout)) {
+ /*
+ * Timeout waiting for CPU exit rendez-vous
+ */
+ goto mcu_reset;
+ }
+ }
+#else
+ atomic_set(&mcu_leavin, cpus);
+#endif
+ mcu_scan();
+ ret = 0;
+ }
+ else {
+ /*
+ * Exit rendez-vous point.
+ * Held here until CPU 1 resets the counters below.
+ */
+ while(atomic_read(&mcu_leavin) != 0) {
+ if (mcu_timed_out(&timeout)) {
+ /*
+ * Timeout waiting in CPU exit rendez-vous
+ */
+ goto mcu_reset;
+ }
+ }
+ return 0;
+ }
+
+ /*
+ * Reset rendez-vous counters, letting all CPUs
+ * leave this function 'simultaneously'.
+ */
+mcu_reset:
+ atomic_set(&mcu_callin, 0);
+ atomic_set(&mcu_leavin, 0);
+ return ret;
+}
+
+
+/*
+ * NMI exception handler
+ * Uncertain if all cpumask_* functions implies barriers,
+ * so erroring on the safe side explicit barriers is used.
+ */
+
+#if BEAM_TEST
+/*
+ * BEAM_TEST variant of the NMI handler: a single-CPU scan of all
+ * un-core banks without the rendez-vous machinery. Returns 0 when
+ * the NMI is not ours (wrong CPU / nothing pending), 1 otherwise.
+ * The mcu_wait()/mcu_go() calls after 'return 1' are intentionally
+ * unreachable (see the comment at the bottom).
+ */
+static int
+mcu_nmi(int cpu)
+{
+#ifdef CONFIG_MK1OM
+ uint32_t mcg_status_lo, mcg_status_hi;
+#endif
+ struct _mcu_rec * mr;
+ MceInfo mc;
+ int i, j;
+
+ if (cpu != mcu_cpu)
+ return 0;
+
+ if (! mcu_prescan())
+ return 0;
+
+ wbinvd();
+
+ /* Block core MC exceptions (MCIP) while scanning; KnC only */
+#ifdef CONFIG_MK1OM
+ rdmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
+ wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo | MCG_STATUS_MCIP, mcg_status_hi);
+#endif
+
+ for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
+ mr = mcu_src + i;
+
+#ifdef CONFIG_MK1OM
+ if (mr->org == MC_ORG_TBOX && !mr_txs())
+ continue;
+#endif
+
+ for(j = 0; j < mr->num; j++) {
+ mc.status = mr->rq(j, mr->ofs + MCU_STAT);
+
+ if (! (mc.status & MCI_STATUS_VAL))
+ continue;
+
+ if (! (mc.status & MCI_STATUS_EN)) {
+ mr->wq(j, mr->ofs + MCU_STAT, 0);
+ continue;
+ }
+
+ mcu_read(mr, j, &mc);
+ mr->wq(j, mr->ofs + MCU_STAT, 0);
+
+ mc.org = mr->org;
+ mc.id = j;
+ mc.stamp = get_seconds();
+ mc.flags = (mc.status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0;
+
+ micras_mc_send(&mc, 1);
+ }
+ }
+
+#ifdef CONFIG_MK1OM
+ wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
+#endif
+ return 1;
+
+ /*
+ * Damn compiler options !!!!!!
+ * Don't want more changes than this routine, so
+ * added dummies to shut up gcc about unused code.
+ */
+ i = mcu_wait();
+ mcu_go(i);
+}
+#else
+
+static atomic_t mcu_entry;
+
+/*
+ * Main NMI exception handler for un-core MC events.
+ * Returns 0 when the NMI is not ours (wrong CPU and no cascade in
+ * progress, or nothing pending in any bank), 1 when handled.
+ *
+ * The SBOX delivers the un-core MC NMI to a single CPU (mcu_cpu),
+ * which fans it out to all other online CPUs via IPI and then herds
+ * everybody through the mcu_wait()/mcu_go() rendez-vous maze so one
+ * CPU runs mcu_scan() in isolation.
+ *
+ * Fix vs. original: under '#if !MCU_NMI' the mcu_eoi path marked the
+ * CPU in mcc_exc_mask (the core handler's mask) instead of this
+ * handler's own mcu_exc_mask; since the ack test below reads
+ * mcu_exc_mask, the local APIC was never ACKed when the NMI arrived
+ * via sbox_handler()'s self-IPI.
+ */
+static int
+mcu_nmi(int cpu)
+{
+#ifdef CONFIG_MK1OM
+    uint32_t mcg_status_lo, mcg_status_hi;
+#endif
+    int order, eoi;
+
+    atomic_inc(&mcu_entry);
+
+    /*
+     * Get MCA status from SBOX.
+     */
+#if 0
+    /*
+     * If no source bits set, this was not an un-core MCA
+     * This would work if the SBOX_MCA_INT_STAT actually worked
+     * as described both in HAS and register specification.
+     * Unfortunately, it doesn't, as per tribal knowledge errata.
+     */
+    uint32_t int_stat, int_en;
+
+    int_en = mr_sbox_rl(0, SBOX_MCA_INT_EN);
+    int_stat = mr_sbox_rl(0, SBOX_MCA_INT_STAT);
+    if (! (int_en & int_stat)) {
+        atomic_dec(&mcu_entry);
+        return 0;
+    }
+#else
+    /*
+     * Instead of having a single source of pending un-core MCA events,
+     * we now have to walk all BOXes to check if there is a valid event
+     * pending in one of them. That is much more expensive as we have
+     * to check this on all NMIs, including our own cascade NMIs used
+     * to corral all CPUs in their rendez-vous point(s). We try to avoid
+     * this scan if there already is an un-core NMI in progress.
+     * We know that:
+     *   un-core MCA NMIs are sent to just one CPU, mcu_cpu
+     *   CPUs targeted in the cascade are in mcu_exc_mask
+     *   non-zero atomic variable 'mcu_callin' tells cascade is in progress
+     */
+    if (!cpumask_empty(&mcu_exc_mask))
+        goto invited;
+    if (cpu != mcu_cpu) {
+        atomic_dec(&mcu_entry);
+        return 0;
+    }
+
+    /*
+     * On the SBOX target CPU and no un-core handling in progress!
+     * Then scan all BOXes for valid events pending.
+     * If there wasn't any, this is a false alarm and we return.
+     */
+    if (! mcu_prescan()) {
+        atomic_dec(&mcu_entry);
+        return 0;
+    }
+
+invited:
+#endif
+
+    /*
+     * Flush all caches.
+     * This is uncore so it should not be necessary to
+     * empty internal (L1) caches, doesn't harm either.
+     */
+    wbinvd();
+
+    /*
+     * We do not want to be interrupted by a core MC
+     * exception while handling an NMI. We can block
+     * core MC events by setting the MCG_STATUS_MCIP.
+     * This is a MSR, so it has to be done on all CPUs.
+     * On KnC that is, KnF does not have that MSR.
+     */
+#ifdef CONFIG_MK1OM
+    rdmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
+    wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo | MCG_STATUS_MCIP, mcg_status_hi);
+#endif
+
+    /*
+     * Special for the SBOX NMI target CPU:
+     *  - disconnect un-core MC lines from SBOX I/O-APIC, such
+     *    that we don't get stacked NMIs in the Local APICs.
+     *  - simulate a NMI broadcast by sending NMI to all _other_
+     *    active CPUs via IPIs. The SBOX could do a broadcast,
+     *    but that will send NMIs to sleeping CPUs too, which
+     *    we prefer to avoid if possible.
+     *TBD: should creating the mcu_exc_mask be protected by
+     *     lock, similar to core events? Who can interfere?
+     */
+    if (cpu == mcu_cpu) {
+        mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
+        cpumask_copy(&mcu_exc_mask, cpu_online_mask);
+        cpumask_clear_cpu(cpu, &mcu_exc_mask);
+        smp_wmb();
+        // apic->send_IPI_mask(&mcu_exc_mask, NMI_VECTOR);
+        apic->send_IPI_allbutself(NMI_VECTOR);
+#if !MCU_NMI
+        if (mcu_eoi) {
+            smp_rmb();
+            /*
+             * This NMI was raised by sbox_handler()'s self-IPI, so
+             * this CPU too must ACK its local APIC on the way out.
+             * Must target mcu_exc_mask (tested before ack_APIC_irq
+             * below), not mcc_exc_mask as the original code did.
+             */
+            cpumask_set_cpu(cpu, &mcu_exc_mask);
+            smp_wmb();
+            mcu_eoi = 0;
+        }
+#endif
+    }
+
+    /*
+     * Corral all CPUs through the rendez-vous point maze.
+     * It guarantees that:
+     *  - No CPU leaves mcu_wait() until all has entered.
+     *  - One CPU leaves mcu_wait() at a time.
+     *  - No CPU leaves mcu_go() until all has entered.
+     *  - While one CPU is in transit between mcu_wait()
+     *    and mcu_go(), all other CPUs are sitting in
+     *    tight busy-wait loops in either function.
+     *  - All CPUs leaves mcu_go() at the same time.
+     * If there is any 'per-cpu' activity that needs to be
+     * run in isolation, it must be placed between mcu_wait()
+     * and mcu_go().
+     */
+    order = mcu_wait();
+    if (mcu_go(order)) {
+        /*
+         * Timeout waiting at one of the rendez-vous points.
+         * Scan the un-core MCA banks just in case.
+         */
+        mcu_scan();
+    }
+
+    /*
+     * Special for the SBOX NMI target CPU:
+     *  - reconnect un-core MC lines through to SBOX I/O-APIC.
+     *    If new events already are pending, then this will
+     *    result in a 'rising-edge' trigger to the I/O-APIC.
+     */
+    if (cpu == mcu_cpu)
+        mr_sbox_wl(0, SBOX_MCA_INT_EN, mr_txs() ? 0x0fffff07 : 0xff07);
+
+    /*
+     * If this CPU got its NMI from an IPI, then it must
+     * send an ACK to its local APIC (I think).
+     */
+    smp_rmb();
+    eoi = cpumask_test_and_clear_cpu(cpu, &mcu_exc_mask);
+    smp_wmb();
+    if (eoi)
+        ack_APIC_irq();
+
+    /*
+     * Restore core MCG status and return 1 indicating to the
+     * kernel NMI handler we've handled it.
+     *TBD: reduce to one write per core instead of one per thread?
+     */
+#ifdef CONFIG_MK1OM
+    wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
+#endif
+    atomic_dec(&mcu_entry);
+    return 1;
+}
+#endif
+
+
+#if !MCU_NMI
+/*
+ * MCA handler if using standard interrupts
+ * It's just a trampoline to convert a regular interrupt
+ * into an NMI, which is only needed if the I/O-APIC can't
+ * generate an NMI.
+ *
+ *TBD: remove all this? It is not used on KnC, and the KnF's
+ * I've tested this on all have been OK sending NMIs.
+ */
+
+static irqreturn_t
+sbox_handler(int irq, void * tag)
+{
+ /*
+ * Convert this regular interrupt into an NMI.
+ * Record this CPU as the event-handling CPU (mcu_cpu), flag
+ * that a local APIC EOI will be owed (mcu_eoi, consumed by the
+ * NMI-side handler), and self-send an NMI so the uncore event
+ * is processed by the NMI path rather than in IRQ context.
+ */
+ mcu_cpu = smp_processor_id();
+ mcu_eoi = 1;
+ apic->send_IPI_self(NMI_VECTOR);
+ return IRQ_HANDLED;
+}
+#endif
+
+
+/*
+ * Reset all uncore MCA banks to defaults
+ */
+
+void
+box_reset(int arm)
+{
+ int i, j;
+ struct _mcu_rec * mr;
+
+ /*
+ * Walk every uncore MCA bank descriptor in mcu_src.
+ * 'arm' is passed through to mcu_reset(); per the callers it is
+ * 1 when (re)arming event reporting (mcu_init) and 0 when
+ * disabling it (mcu_exit).
+ */
+ for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
+ mr = mcu_src + i;
+
+#ifdef CONFIG_MK1OM
+ /* TBOX banks are only present when mr_txs() says so. */
+ if (mr->org == MC_ORG_TBOX && !mr_txs())
+ continue;
+#endif
+
+ for(j = 0; j < mr->num; j++) {
+ uint64_t status;
+
+ /*
+ *TBD: Do we want to pick up existing MCA events or drop
+ * them because we don't know _when_ they occurred?
+ * Reporting them would require internal buffer because
+ * it's unlikely the SCIF MC session is up at this point.
+ * For now we just enter events into the system log.
+ */
+ status = mr->rq(j, mr->ofs + MCU_STAT);
+ if (status & MCI_STATUS_VAL) {
+ MceInfo mc;
+
+ /* Valid event already latched in the bank: log and discard. */
+ mcu_read(mr, j, &mc);
+ printk("RAS.uncore: discard MC event:\n"
+ "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
+ mr->org, j, mc.ctl, status, mc.addr, mc.misc);
+ }
+
+ /*
+ * Reset MCA bank registers.
+ */
+ mcu_reset(mr, j, arm);
+ }
+ }
+}
+
+
+/*
+ * Setup interrupt handlers by hooking into the SBOX's I/O-APIC.
+ * For now, we send an NMI to single CPU, and let it process the
+ * event. This may need to be expanded into a broadcast NMI similar
+ * to what the generic core MC event handler does in order to keep
+ * containment as high as we possibly can.
+ *
+ *TBD: code a dual rendez-vous mechanism on all active CPUs.
+ */
+
+int __init
+mcu_init(void)
+{
+#if MC_VERBOSE
+ int i, j;
+#endif
+
+ if (mce_disabled) {
+ printk("RAS.uncore: disabled\n");
+ }
+ else {
+ /*
+ * Clear rendez-vous counters
+ */
+ atomic_set(&mcu_callin, 0);
+ atomic_set(&mcu_leavin, 0);
+
+#if MC_VERBOSE
+ /*
+ * For debug only:
+ * Record all SBOX I/O-APIC registers to kernel log
+ * NOTE(review): mr_sbox_rl() is printed with %lx here but
+ * with %x at the bottom of this function; one of the two
+ * format widths presumably mismatches mr_sbox_rl()'s return
+ * type -- confirm against its declaration.
+ * NOTE(review): the label below says "APICCRT" while the
+ * register read is SBOX_APICRT0 + 8*i -- confirm the name.
+ */
+ printk("SBOX_APICIDR: %lx\n", mr_sbox_rl(0, SBOX_APICIDR));
+ printk("SBOX_APICVER: %lx\n", mr_sbox_rl(0, SBOX_APICVER));
+ printk("SBOX_APICAPR: %lx\n", mr_sbox_rl(0, SBOX_APICAPR));
+ for(i = 0; i < 26 ; i++)
+ printk("APICCRT%d: %llx\n", i, mr_sbox_rq(0, SBOX_APICRT0 + (8 * i)));
+ for(i = 0; i < 8 ; i++)
+ printk("APICICR%d: %llx\n", i, mr_sbox_rq(0, SBOX_APICICR0 + (8 * i)));
+ printk("SBOX_MCA_INT_EN: %lx\n", mr_sbox_rl(0, SBOX_MCA_INT_EN));
+ printk("SBOX_MCA_INT_STAT: %lx\n", mr_sbox_rl(0, SBOX_MCA_INT_STAT));
+#endif
+
+ /*
+ * Disconnect un-core MC lines from SBOX I/O-APIC, setup the
+ * individual BOXes, and clear any un-core MC pending flags
+ * from SBOX I/O-APIC
+ */
+ mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
+ box_reset(1);
+ mr_sbox_wl(0, SBOX_MCA_INT_STAT, 0);
+
+ /*
+ * Setup the SBOX I/O-APIC.
+ * Un-core MC events are routed through a mask in register
+ * SBOX_MCA_INT_EN into I/O APIC redirection table entry #16.
+ * Ideally we want all uncore MC events to be handled similar
+ * to core MCAs, which means we'd like an NMI on all CPUs.
+ * On KnF the I/O-APIC may not trigger an NMI (PoC security)
+ * and on KnC where NMI delivery is possible, it appears not
+ * to be ideal to broadcast it to all CPUs because it could
+ * wake up cores put to sleep by power management rules.
+ * See MCA HAS, SBOX HAS Vol 4, and A0 Vol 2 for details.
+ *
+ * The redirection table entry has the following format:
+ * 47:32 Destination ID field
+ * 17 Interrupt set (testing: trigger an interrupt)
+ * 16 Interrupt mask (0=enable, 1=disable)
+ * 15 Trigger mode (0=edge, 1=level)
+ * 14 Remote IRR (0=inactive, 1=accepted)
+ * 13 Interrupt polarity (0=active_high, 1=active_low)
+ * 12 Delivery status (0=idle, 1=send_pending)
+ * 11 Destination mode (0=physical, 1=logical)
+ * 10:8 Delivery mode (0=fixed, low, SMI, rsvd, NMI, INIT, rsvd, ext)
+ * 7:0 Interrupt vector
+ *
+ * The I/O-APIC input is 'rising edge', so we'd need to select
+ * it to be edge triggered, active high.
+ */
+#if MCU_NMI
+ /*
+ * If event delivery by NMI is preferred, we want it delivered on
+ * the BP. There is already an NMI handler present, so we have to
+ * tap into the existing NMI handler for the event notifications.
+ *
+ * The bit-fiddling below says:
+ * NMI delivery | Destination CPU APIC ID
+ * The entry is written first with mask bit 16 set, then without
+ * it, i.e. programmed while masked and then unmasked.
+ */
+ mcu_cpu = 0;
+ mcu_redir = PUT_BITS(10, 8, 4) | PUT_BITS(47, 32, (uint64_t) cpu_data(mcu_cpu).apicid);
+ mcu_old_redir = mr_sbox_rq(0, SBOX_APICRT16);
+ mr_sbox_wq(0, SBOX_APICRT16, mcu_redir | PUT_BITS(16, 16, 1));
+ mr_sbox_wq(0, SBOX_APICRT16, mcu_redir);
+#else
+ /*
+ * If event delivery by regular interrupt is preferred, then all
+ * I/O-APIC setup will be handled by calling request_irq(16,..).
+ * There is no guarantee that the event will be sent to the BP
+ * (though it's more than likely) so we'll defer identifying the
+ * event handling CPU (mcu_cpu) till we receive the callback from
+ * the interrupt handling sub-system.
+ * The sbox_handler() function just converts the callback into an
+ * NMI because the only way containment can be achieved is to be
+ * able to lock down the system completely, which is not realistic
+ * using regular interrupts.
+ */
+ mcu_eoi = 0;
+ (void) request_irq(16, sbox_handler, IRQF_TRIGGER_HIGH, "un-core mce", (void *) 42);
+#endif
+
+ /*
+ * Finally, place hook in NMI handler in case there's
+ * an un-core event pending and connect un-core MC lines
+ * through to SBOX I/O-APIC. From this point onwards we
+ * can get uncore MC events at any time.
+ */
+ mca_nmi = mcu_nmi;
+ mr_sbox_wl(0, SBOX_MCA_INT_EN, mr_txs() ? 0x0fffff07 : 0xff07);
+
+#if MC_VERBOSE
+ /*
+ * For debug only
+ * Record initial uncore MCA banks to kernel log.
+ */
+ printk("RAS.uncore: dumping all banks\n");
+
+ /*
+ * Dump all MCA registers we set to kernel log
+ */
+ for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
+ char * boxname;
+ struct _mcu_rec * mr;
+ uint64_t ctl, stat, addr, misc;
+
+ mr = mcu_src + i;
+#ifdef CONFIG_MK1OM
+ /* TBOX banks are only present when mr_txs() says so. */
+ if (mr->org == MC_ORG_TBOX && !mr_txs())
+ continue;
+#endif
+ switch(mr->org) {
+ case MC_ORG_SBOX: boxname = "SBOX"; break;
+ case MC_ORG_DBOX: boxname = "DBOX"; break;
+ case MC_ORG_GBOX: boxname = "GBOX"; break;
+ case MC_ORG_TBOX: boxname = "TBOX"; break;
+ default: boxname = "??"; /* not expected; silences compiler warning */
+ }
+
+ for(j = 0; j < mr->num; j++) {
+
+ /* Per-bank register widths vary; qflg selects 32 vs 64-bit access. */
+ if (mr->qflg & MCU_CTL_64)
+ ctl = mr->rq(j, mr->ofs + MCU_CTRL);
+ else
+ ctl = (uint64_t) mr->rl(j, mr->ofs + MCU_CTRL);
+
+ stat = mr->rq(j, mr->ofs + MCU_STAT);
+
+ if (mr->qflg & MCU_NO_ADDR)
+ addr = 0;
+ else {
+ if (mr->qflg & MCU_ADDR_32)
+ addr = (uint64_t) mr->rl(j, mr->ofs + MCU_ADDR);
+ else
+ addr = mr->rq(j, mr->ofs + MCU_ADDR);
+ }
+
+ if (mr->qflg & MCU_NO_MISC)
+ misc = 0;
+ else {
+ if (mr->qflg & MCU_MISC_64)
+ misc = mr->rq(j, mr->ofs + MCU_MISC);
+ else
+ misc = (uint64_t) mr->rl(j, mr->ofs + MCU_MISC);
+ }
+
+ printk("RAS.uncore: %s[%d] = { %llx, %llx, %llx, %llx }\n",
+ boxname, j, ctl, stat, addr, misc);
+ }
+ }
+ printk("RAS.uncore: MCA_INT_EN = %x\n", mr_sbox_rl(0, SBOX_MCA_INT_EN));
+ printk("RAS.uncore: APICRT16 = %llx\n", mr_sbox_rq(0, SBOX_APICRT16));
+#endif
+
+ printk("RAS.uncore: init complete\n");
+ }
+
+ return 0;
+}
+
+
+/*
+ * Cleanup for module unload.
+ * Clear/restore hooks in the SBOX's I/O-APIC.
+ */
+
+int __exit
+mcu_exit(void)
+{
+ if (! mce_disabled) {
+
+ /*
+ * Disconnect uncore MC lines from SBOX I/O-APIC.
+ * No new uncore MC interrupts will be made.
+ */
+ mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
+
+ /*
+ * Disconnect exception handler.
+ * NMI mode: restore the redirection table entry saved by
+ * mcu_init(). Interrupt mode: release IRQ 16 with the same
+ * tag cookie ((void *) 42) that was passed to request_irq().
+ */
+#if MCU_NMI
+ mcu_redir = 0;
+ mr_sbox_wq(0, SBOX_APICRT16, mcu_old_redir);
+#else
+ mcu_eoi = 0;
+ free_irq(16, (void *) 42);
+#endif
+
+ /*
+ * Cut link from kernel's NMI handler and
+ * wait for everybody in handler to leave.
+ * mcu_entry is held non-zero while the NMI handler runs,
+ * so spinning until it drains makes teardown safe.
+ */
+ mca_nmi = 0;
+ while(atomic_read(&mcu_entry))
+ cpu_relax();
+ mcu_cpu = -1;
+
+ /*
+ * No more events will be received, clear
+ * MC reporting in all BOXes (just in case)
+ */
+ box_reset(0);
+ }
+
+ printk("RAS.uncore: exit complete\n");
+ return 0;
+}
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * The Monahan GX processor implementation of the I2C unit does not support
+ * the hardware general call, 10-bit slave addressing or CBUS compatibility.
+ * Otherwise it is compliant with I2C spec version 2.1.
+ *
+ * This is the SBOX 'OverClock' bus controller, which for reference is
+ * mostly like the I2C controller on PXA270 with the above limitations.
+ */
+
+#ifndef _MONAHAN_H_
+#define _MONAHAN_H_ 1
+
+/*
+**
+** Layer 1 stuff
+**
+** Offsets and bit definitions for the Monahans I2C controller.
+** This is equivalent to defines in 'i2c-pxa.c', but kept separate.
+*/
+
+/*
+ * Register locations (base SBOX register SBOX_OC_I2C_ICR)
+ */
+#define ICR_OFFSET 0x00
+#define ISR_OFFSET 0x04
+#define ISAR_OFFSET 0x08
+#define IDBR_OFFSET 0x0c
+#define IBMR_OFFSET 0x10
+
+/*
+ * I2C Control Register bits
+ */
+#define ICR_START 0x00000001 /* Start bit */
+#define ICR_STOP 0x00000002 /* Stop bit */
+#define ICR_ACKNAK 0x00000004 /* Send ACK(0) or NAK(1) */
+#define ICR_TB 0x00000008 /* Transfer byte bit */
+#define ICR_MA 0x00000010 /* Master abort */
+#define ICR_SCLE 0x00000020 /* Master clock enable */
+#define ICR_IUE 0x00000040 /* Unit enable */
+#define ICR_GCD 0x00000080 /* General call disable */
+#define ICR_ITEIE 0x00000100 /* Enable tx interrupts */
+#define ICR_DRFIE 0x00000200 /* Enable rx interrupts */
+#define ICR_BEIE 0x00000400 /* Enable bus error ints */
+#define ICR_SSDIE 0x00000800 /* Slave STOP detected int enable */
+#define ICR_ALDIE 0x00001000 /* Enable arbitration interrupt */
+#define ICR_SADIE 0x00002000 /* Slave address detected int enable */
+#define ICR_UR 0x00004000 /* Unit reset */
+#define ICR_MODE 0x00018000 /* Bus speed mode */
+#define ICR_RESERVED 0xfffe0000 /* Unused */
+
+/*
+ * Bus speed control values
+ * High speed modes are not supported by controller.
+ */
+#define ICR_STANDARD_MODE 0x00000000 /* 100k operation */
+#define ICR_FAST_MODE 0x00008000 /* 400k operation */
+#define ICR_HS_STANDARD_MODE 0x00010000 /* 3.4M/100k operation */
+#define ICR_HS_FAST_MODE 0x00018000 /* 3.4M/400k operation */
+
+/*
+ * Shorthands
+ */
+#define ICR_ON (ICR_IUE | ICR_SCLE) /* Turn unit on */
+#define ICR_INIT_BITS (ICR_ITEIE | \
+ ICR_DRFIE | \
+ ICR_BEIE | \
+ ICR_SADIE | \
+ ICR_FAST_MODE | \
+ ICR_ON) /* Init flags */
+
+/*
+ * I2C Status Register bits
+ */
+#define ISR_RWM 0x00000001 /* Read(1)/write(0) mode */
+#define ISR_ACKNAK 0x00000002 /* Ack(0)/nak(1) sent or received */
+#define ISR_UB 0x00000004 /* Unit busy */
+#define ISR_IBB 0x00000008 /* Bus busy */
+#define ISR_SSD 0x00000010 /* Slave stop detected */
+#define ISR_ALD 0x00000020 /* Arbitration loss detected */
+#define ISR_ITE 0x00000040 /* Tx buffer empty */
+#define ISR_IRF 0x00000080 /* Rx buffer full */
+#define ISR_GCAD 0x00000100 /* General call address detected */
+#define ISR_SAD 0x00000200 /* Slave address detected */
+#define ISR_BED 0x00000400 /* Bus error no ACK/NAK */
+#define ISR_RESERVED 0xfffff800 /* Unused */
+
+#define ISR_INTS (ISR_SSD | \
+ ISR_ALD | \
+ ISR_ITE | \
+ ISR_IRF | \
+ ISR_SAD | \
+ ISR_BED) /* Interrupt flags */
+/*
+ * I2C Slave Address Register bits
+ */
+#define ISAR_SLADDR 0x0000007f /* 7-bit address for slave-receive mode */
+#define ISAR_RESERVED 0xffffff80 /* Unused */
+
+/*
+ * I2C Data Buffer Register bits
+ */
+#define IDBR_DATA 0x000000ff /* 8-bit data buffer */
+#define IDBR_RESERVED 0xffffff00 /* Unused */
+
+/*
+ * I2C Bus Monitor Register bits
+ */
+#define IBMR_SDA 0x00000001 /* State of SDA pin */
+#define IBMR_SCL 0x00000002 /* State of SCL pin */
+#define IBMR_RESERVED 0xfffffffc /* Unused */
+
+
+/*
+**
+** Layer 2 stuff
+**
+*/
+
+/*
+ * Bus speed selections
+ */
+#define I2C_STANDARD ICR_STANDARD_MODE
+#define I2C_FAST ICR_FAST_MODE
+#define I2C_HS_STANDARD ICR_HS_STANDARD_MODE
+#define I2C_HS_FAST ICR_HS_FAST_MODE
+
+/*
+ * Command types
+ */
+#define I2C_INVALID -1 /* Internal, not to be used */
+#define I2C_WRITE 0 /* Next transfer will be outgoing */
+#define I2C_READ 1 /* Next transfer will be incoming */
+#define I2C_NOP 2 /* Idle state */
+
+/*
+ * Return codes
+ */
+#define XFER_SUCCESS 0 /* All OK */
+#define INCOMPLETE_XFER -1 /* Basic timeout */
+#define TX_CONTROLLER_ERROR -2 /* Requires reset */
+#define TX_NAK -3 /* NAK, master to send a stop */
+#define RX_SEVERE_ERROR -4 /* Requires reset */
+#define RX_END_WITHOUT_STOP -5 /* Deprecated */
+#define RX_BIZARRE_ERROR -6 /* Doesn't require reset */
+
+
+/*
+**
+** Layer 3 stuff
+**
+*/
+
+/*
+ * Frequency selections
+ */
+#define FREQ_MAX -3 /* As fast as possible */
+#define FREQ_400K -2 /* 400 kHz */
+#define FREQ_100K -1 /* 100 kHz */
+#define FREQ_AUTO 0 /* Default speed */
+
+/*
+ * Return codes: standard kernel codes used
+ * EBUSY, ENODEV, ENXIO, EINVAL, EIO
+ */
+
+#endif /* Recursion block */
--- /dev/null
+obj-m := trace_capture.o
--- /dev/null
+#
+# Trace Capture module
+#
+
+export ARCH = l1om
+
+KERNELDIR = $(CURDIR)/../../mic_linux
+KBUILD := $(MAKE) -C $(KERNELDIR) ARCH=$(ARCH) M=$(CURDIR)
+
+ifneq ($(DESTDIR),)
+INSTALL_MOD_PATH = $(DESTDIR)
+endif
+
+# Fall back to the bundled cross toolchain when the cross gcc is not on PATH.
+ifeq ($(shell \which x86_64-$(ARCH)-linux-gcc 2>/dev/null),)
+export PATH := $(PATH):$(CURDIR)/../cross/bin
+endif
+
+# 'tests' added to .PHONY: without it, a file or directory named
+# 'tests' in this directory would silently mask the target.
+.PHONY: default modules install modules_install clean tests
+
+default: modules tests
+
+modules:
+	+$(KBUILD) $@
+
+install: modules_install
+
+modules_install:
+	+$(KBUILD) INSTALL_MOD_PATH=$(DESTDIR) modules_install
+
+clean:
+	+$(KBUILD) clean
+
+tests:
+	echo no tests
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h> /* open */
+#include <unistd.h> /* exit */
+#include <sys/ioctl.h> /* ioctl */
+
+#include "trace_capture.h"
+
+/*
+ * Ask the trace-capture driver (via its char device fd) to start a
+ * capture with the given trigger value. The ioctl result was
+ * previously ignored; report failures so the user is not left
+ * guessing whether the capture actually started.
+ */
+void
+ioctl_start_capture(int file_desc, long trigger)
+{
+ if (ioctl(file_desc, MICTC_START_CAPTURE, trigger) < 0)
+ perror("MICTC_START_CAPTURE ioctl failed");
+}
+
+/*
+ * Open the trace-capture device, optionally parse a trigger number
+ * from argv[1] (default 1), and issue the start-capture ioctl.
+ */
+int
+main (int argc, char *argv[])
+{
+ int file_desc;
+ long trigger = 1;
+ char *end;
+
+ /* O_RDONLY spelled out instead of a bare 0. Errors go to stderr. */
+ if ((file_desc = open(MICTC_FILE_NAME, O_RDONLY)) < 0) {
+ fprintf(stderr, "Can't open device file: %s\n", MICTC_FILE_NAME);
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Validate with strtol instead of atoi so garbage input is
+ * rejected rather than silently treated as 0.
+ */
+ if (argc == 2) {
+ trigger = strtol(argv[1], &end, 10);
+ if (end == argv[1] || *end != '\0') {
+ fprintf(stderr, "Invalid trigger '%s'\n", argv[1]);
+ close(file_desc);
+ exit(EXIT_FAILURE);
+ }
+ printf("Trigger %ld\n", trigger);
+ }
+
+ ioctl_start_capture(file_desc, trigger);
+ printf("Done.\n");
+
+ close(file_desc);
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include "../include/scif.h"
+#include "trace_capture.h"
+
+/*
+ * Send-then-receive handshake with the card over endpoint 'epd';
+ * 'string' is only a progress message printed before the exchange.
+ * NOTE(review): this macro expands references to local variables
+ * 'err' and 'control_msg' and jumps to a local 'close:' label, so it
+ * can only be used inside a function that declares both and defines
+ * that label (open_scif_channels does). It is a bare braced block,
+ * not do { } while (0) -- avoid using it in unbraced if/else arms.
+ */
+#define BARRIER(epd, string) { \
+ printf("%s\n", string); \
+ if ((err = scif_send(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \
+ printf("scif_send failed with err %d\n", errno); \
+ fflush(stdout); \
+ goto close; \
+ } \
+ if ((err = scif_recv(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \
+ printf("scif_recv failed with err %d\n", errno); \
+ fflush(stdout); \
+ goto close; \
+ } \
+}
+
+#if 0
+// These are common to the Host App
+// and the MIC driver Trace Capture Feature
+// COMMON DEFINES START HERE
+enum TRACE_COMMAND {
+ TRACE_NOP = 100,
+ TRACE_DATA,
+ TRACE_HOST_READY,
+ TRACE_DONE,
+ TRACE_ERROR,
+ TRACE_PRINT,
+ TRACE_GET_FILE,
+ TRACE_PAGE_READY,
+ TRACE_REG_COMPLETE,
+ TRACE_MEM_COMPLETE,
+ TRACE_COMPLETE
+};
+
+#define TRACE_STATUS_OFFSET 8
+#define TRACE_SIZE_OFFSET 12
+
+// Enable/Disable Memory Test.
+// This MUST be enabled simultaneously on Host App as well.
+#define MIC_TRACE_CAPTURE_MEMORY_TEST 0
+
+#if MIC_TRACE_CAPTURE_MEMORY_TEST
+#define TRACE_CHECKSUM_OFFSET 16
+#endif
+
+#define TRACE_TRIGGER_OFFSET 20
+#define TRACE_DATA_OFFSET 4096
+
+// Types of Triggers - Refer to uOS Trace Capture Wiki for Usage
+// Generic counter
+#define TRACE_HOST_GENERIC_COUNTER 0x1
+// Async Flip counter
+#define TRACE_HOST_FRAME_COUNTER 0x2
+// COMMON DEFINES END HERE
+#endif
+
+// End points for SCIF
+//static scif_epd_t mictc_epd_cmd;
+static scif_epd_t mictc_epd_data;
+
+// SCIF ports - temp hack; move to scif.h
+#define MICTC_SCIF_PORT_DATA 300
+
+static volatile uint64_t *g_traceBufferStatusOffset = NULL;
+static volatile uint64_t *g_traceBufferSizeOffset = NULL;
+static volatile uint32_t *g_traceBufferDataOffset = NULL;
+static volatile uint32_t *g_traceBufferTriggerOffset = NULL;
+
+// This is an array of trigger numbers. The value TRACE_EOL is ignored.
+static uint32_t g_traceTriggers[TRACE_TRIGGER_MAX];
+
+static struct scif_portID portID_data;
+static scif_epd_t mictc_newepd;
+
+static void *g_mictc_buffer_base;
+static void *g_mictc_buffer_offset_xml;
+static off_t g_mictc_buffer_offset_mem;
+
+FILE *fp;
+
+/*
+ * Host side of the trace-capture protocol.
+ * Sets up a SCIF listening endpoint, then for each connection from
+ * the card: registers a 1 GB staging window, exchanges offsets, maps
+ * the card's XML buffer, writes the trigger list, and collects first
+ * the register dump (cpu.xml) and then the memory dump (mem.dat).
+ * Returns 1 on orderly shutdown, 0 on a setup error.
+ *
+ * Fixes over the previous revision:
+ * - scif_open error message printed a fixed "ENOMEM" while passing
+ * errno as an unused argument; now formats errno.
+ * - fopen() failures on cpu.xml/mem.dat kept running with a NULL
+ * FILE* (crash on first use); now bail out.
+ * - the TRACE_ABORTED path leaked the open cpu.xml stream.
+ * - scif_unregister() was given the buffer's virtual address; it
+ * takes the offset returned by scif_register().
+ */
+static
+int open_scif_channels(void)
+{
+ int err;
+ struct pollfd spollfd;
+ int control_msg = 0;
+ long scif_offset_dst;
+ int timeout = 0;
+ int i;
+
+ /* Page-aligned 1 GB staging buffer the card dumps pages into. */
+ if ((err = posix_memalign(&g_mictc_buffer_base, 0x1000, MICTC_MEM_BUFFER_SIZE))) {
+ fprintf(stderr, "posix_memalign failed with %d\n", err);
+ return 0;
+ }
+ // Data channel
+ if ((mictc_epd_data = scif_open()) == SCIF_OPEN_FAILED) {
+ fprintf(stderr, "scif_open failed with error %d\n", errno);
+ return 0;
+ }
+
+ if (scif_bind(mictc_epd_data, MICTC_SCIF_PORT_DATA) == -1) {
+ fprintf(stderr, "scif_bind failed with error %d\n", errno);
+ return 0;
+ }
+
+ portID_data.node = 1;
+ portID_data.port = MICTC_SCIF_PORT_DATA;
+
+ if (scif_listen(mictc_epd_data, 1) == -1) {
+ fprintf(stderr, "scif_listen failed with error %d\n", errno);
+ return 0;
+ }
+
+ while (1) {
+ printf("scif_accept in poll mode until a connect request is found\n");
+ err = 1;
+ while (err) {
+ spollfd.fd = scif_get_fd(mictc_epd_data);
+ spollfd.events = POLLIN;
+ spollfd.revents = 0;
+ if ((err = poll(&spollfd, 1, -1)) < 0) {
+ printf("poll failed with err %d\n", errno);
+ }
+ if (((err = scif_accept(mictc_epd_data, &portID_data, &mictc_newepd, 0)) < 0) && (errno != EAGAIN)) {
+ printf("scif_accept failed with err %d\n", errno);
+ return 0;
+ }
+ }
+
+ printf("scif_accept from port %d complete\n", portID_data.port);
+
+ /* Expose the staging buffer to the card; remember the offset. */
+ if ((g_mictc_buffer_offset_mem = scif_register(mictc_newepd, g_mictc_buffer_base, MICTC_MEM_BUFFER_SIZE, 0, // suggested_offset,
+ SCIF_PROT_READ | SCIF_PROT_WRITE, 0)) < 0) {
+ fprintf(stderr, "scif_register failed with err %d\n", errno);
+ return 0;
+ }
+
+ printf("After scif_register, g_mictc_buffer_offset_mem = %llx\n",
+ (unsigned long long)g_mictc_buffer_offset_mem);
+ fflush(stdout);
+
+ BARRIER(mictc_newepd, "before barrier");
+
+ /*
+ * Tell the card where our registered window lives, then learn
+ * where the card's XML buffer lives.
+ */
+ if ((err =
+ scif_send(mictc_newepd, &g_mictc_buffer_offset_mem, sizeof(g_mictc_buffer_offset_mem),
+ SCIF_SEND_BLOCK)) <= 0) {
+ printf("scif_send failed with err %d\n", errno);
+ fflush(stdout);
+ goto close;
+ }
+
+ printf("Before scif_recv\n");
+ fflush(stdout);
+
+ if ((err = scif_recv(mictc_newepd, &scif_offset_dst, sizeof(scif_offset_dst), SCIF_RECV_BLOCK)) <= 0) {
+ printf("scif_recv failed with err %d\n", errno);
+ fflush(stdout);
+ goto close;
+ }
+ printf("scif_offset_dst = %lx\n", scif_offset_dst);
+
+ printf("Before scif_mmap\n");
+
+ if ((g_mictc_buffer_offset_xml = scif_mmap(0, // physical address
+ MICTC_XML_BUFFER_SIZE, // length
+ SCIF_PROT_READ | SCIF_PROT_WRITE, // protection
+ 0, // flags
+ mictc_newepd, // endpoint
+ scif_offset_dst) // offset
+ ) == (void *)-1) {
+ fprintf(stderr, "scif_mmap failed with err %d\n", errno);
+ return 0;
+ }
+
+ /* Control/status words sit at fixed offsets in the XML window. */
+ g_traceBufferStatusOffset = (uint64_t *) (g_mictc_buffer_offset_xml + TRACE_STATUS_OFFSET);
+ g_traceBufferSizeOffset = (uint64_t *) (g_mictc_buffer_offset_xml + TRACE_SIZE_OFFSET);
+ g_traceBufferDataOffset = (uint32_t *) (g_mictc_buffer_offset_xml + TRACE_DATA_OFFSET);
+ g_traceBufferTriggerOffset = (uint32_t *) (g_mictc_buffer_offset_xml + TRACE_TRIGGER_OFFSET);
+
+ for (i = 0; i < TRACE_TRIGGER_MAX; i++) {
+ *g_traceBufferTriggerOffset = g_traceTriggers[i];
+ g_traceBufferTriggerOffset++;
+ }
+
+ *g_traceBufferStatusOffset = TRACE_HOST_READY;
+
+ printf("Before fopen\n");
+
+ if ((fp = fopen("cpu.xml", "w")) == NULL) {
+ fprintf(stderr, "Cannot open file cpu.xml.\n");
+ goto close;
+ }
+
+ printf("Waiting for TRACE_REG_COMPLETE or TRACE_ABORTED");
+ fflush(stdout);
+
+ while (*g_traceBufferStatusOffset != TRACE_REG_COMPLETE) {
+ printf(".");
+ fflush(stdout);
+ sleep(1);
+ if (timeout++ >= 200) {
+ // Hmmm, something is hung up. Save everything in the buffer ignoring length.
+ printf("Punt!\n");
+ fprintf(fp, "%s\n", (char *)g_traceBufferDataOffset);
+ *g_traceBufferStatusOffset = TRACE_GET_FILE;
+ fclose(fp);
+ sleep(5);
+ goto close; // and quit
+ }
+ // If this happens the current trigger was not one we want -- reset and wait.
+ if (*g_traceBufferStatusOffset == TRACE_ABORTED) {
+ printf("\nAborted trace\n");
+ fflush(stdout);
+ fclose(fp); /* was leaked on this path */
+ goto close2;
+ }
+ }
+ printf("\n");
+
+ {
+ int j;
+
+ asm volatile ("lfence" ::: "memory");
+ j = *g_traceBufferSizeOffset;
+ fprintf(fp, "%*s\n", j, (char *)g_traceBufferDataOffset);
+ }
+ *g_traceBufferStatusOffset = TRACE_GET_FILE;
+ fclose(fp);
+ sleep(5);
+
+ // Memory dump
+
+ if ((fp = fopen("mem.dat", "w")) == NULL) {
+ fprintf(stderr, "Cannot open file mem.dat.\n");
+ goto close;
+ }
+
+ printf("Waiting for memory pages\n");
+ fflush(stdout);
+
+ timeout = 0;
+
+ {
+ long i = 0;
+
+ while (*g_traceBufferStatusOffset != TRACE_MEM_COMPLETE) {
+ if (*g_traceBufferStatusOffset == TRACE_PAGE_READY) {
+ printf(" %ld", i++);
+ fflush(stdout);
+ asm volatile ("lfence" ::: "memory");
+
+ if (fwrite(g_mictc_buffer_base, *g_traceBufferSizeOffset, 1, fp) != 1) {
+ fprintf(stderr, "\nCannot write file mem.dat. error = %d\n", ferror(fp));
+ return 0;
+ }
+ *g_traceBufferStatusOffset = TRACE_HOST_READY; // Get next page
+ timeout = 0;
+ } else {
+ usleep(10000);
+
+ if (timeout++ >= 2000) {
+ // Hmmm, something is hung up. Just close and quit.
+ printf("Punt!\n");
+ fclose(fp);
+ sleep(5);
+ goto close; // and quit
+ }
+ }
+ }
+ }
+ printf("\nClosing memory dump file.\n");
+ fflush(stdout);
+ fclose(fp);
+ *g_traceBufferStatusOffset = TRACE_COMPLETE; // File is closed; tell driver we are done.
+ printf("Done.\n");
+ fflush(stdout);
+ close2:
+ sleep(2);
+ scif_munmap(g_mictc_buffer_offset_xml, MICTC_XML_BUFFER_SIZE);
+ /* Unregister with the offset scif_register() returned, not the VA. */
+ scif_unregister(mictc_newepd, g_mictc_buffer_offset_mem, MICTC_MEM_BUFFER_SIZE);
+ scif_close(mictc_newepd);
+ } // while (1)
+ close:
+ scif_munmap(g_mictc_buffer_offset_xml, MICTC_XML_BUFFER_SIZE);
+ scif_close(mictc_newepd);
+ scif_close(mictc_epd_data);
+ free(g_mictc_buffer_base);
+ return 1;
+}
+
+/*
+ * Fill the trigger table from the command line (empty slots stay
+ * TRACE_EOL, which the driver ignores), then run the SCIF capture
+ * loop; exit status reflects open_scif_channels()'s result.
+ */
+int main(int argc, char *argv[])
+{
+ int slot;
+
+ /* Start with every trigger slot marked empty. */
+ for (slot = 0; slot < TRACE_TRIGGER_MAX; slot++)
+ g_traceTriggers[slot] = TRACE_EOL;
+
+ if (argc < 2) {
+ printf("No triggers -- accept everything\n");
+ } else {
+ /* Copy at most TRACE_TRIGGER_MAX trigger numbers from argv. */
+ for (slot = 1; slot < argc && slot <= TRACE_TRIGGER_MAX; slot++) {
+ g_traceTriggers[slot - 1] = atoi(argv[slot]);
+ printf("Trigger %d\n", g_traceTriggers[slot - 1]);
+ }
+ }
+
+ exit(open_scif_channels() ? 0 : 1);
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include "../include/scif.h"
+
+// Use 2MB for KNF and 4MB for KNC.
+#define MICTC_XML_BUFFER_SIZE (2 * 1024 * 1024)
+
+// Memory transfer window. 1GB
+#define MICTC_MEM_BUFFER_SIZE (1 * 1024UL * 1024UL * 1024UL)
+
+FILE *ip;
+FILE *op;
+
+
+/*
+ * Convert the raw page dump mem.dat (written by the capture host
+ * app) into memfmt.txt: one "origin <phys-addr>" header per 4 KB
+ * page followed by the page contents as 32-bit hex words.
+ */
+int main(void)
+{
+ long srcPhysAddr = 0;
+ uint32_t page_buf[4096/4];
+ long i = 0;
+ int size;
+ char dest[64];
+
+ /* Previously a failed fopen() was only reported, then the NULL
+ * stream was used anyway; now we bail out. */
+ if ((ip = fopen("mem.dat", "rb")) == NULL) {
+ fprintf(stderr, "Cannot open file mem.dat.\n");
+ return 1;
+ }
+
+ if ((op = fopen("memfmt.txt", "w")) == NULL) {
+ fprintf(stderr, "Cannot open file memfmt.txt.\n");
+ fclose(ip);
+ return 1;
+ }
+
+ /*
+ * Loop on fread() itself rather than feof(): the old
+ * while (!feof()) form re-processed the final page after a
+ * short/failed read and never checked fread's result.
+ */
+ while (fread(page_buf, sizeof(page_buf), 1, ip) == 1) {
+ size = sprintf(dest, "origin %lx\n", srcPhysAddr);
+ fwrite(dest, size, 1, op);
+
+ for (i = 0; i < 4096/4; i++) {
+ size = sprintf(dest, "%x\n", page_buf[i]);
+ fwrite(dest, size, 1, op);
+ }
+
+ srcPhysAddr += 4096;
+ }
+ fclose(ip);
+ fclose(op);
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * Trace Capture Driver
+ *
+ * Contains code to handle trace_capture syscall, stop all cpus
 * and dump their state, then dump all physical memory.
+ */
+
+#include "trace_capture.h"
+
+//#define DEBUG
+
+int always_false = 0;
+
/*
 * Handshake with the SCIF peer: log 'string', send one control message,
 * then block until the peer sends one back (both calls use block mode,
 * flag 1).  On any send/recv failure, 'err' holds the error and control
 * jumps to a 'close' label — so this macro may only be used in a scope
 * that declares 'err' and 'control_msg' and provides a 'close:' label.
 */
#define BARRIER(epd, string) { \
	printk("%s\n", string); \
	if ((err = scif_send(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \
		pr_crit("%s:%s:%d scif_send failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err); \
		goto close; \
	} \
	if ((err = scif_recv(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \
		pr_crit("%s:%s:%d scif_recv failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err); \
		goto close; \
	} \
}
+
+/* SPU privileged gates (per specification) */
+#define SPU_SPBA_OFFSET 0x1000 /* offset of Privileged gates in SPU MMIO */
+#define SPU_XQ_SIZE 0x040
+#define SPU_XQ_BASE 0x080
+#define SPU_XQ_INDEX 0x0C0
+#define SPU_CR 0x100
+#define SPU_CONTROL 0x100
+#define SPU_SAMPLER_BASE 0x140
+#define SPU_ABORT 0x180
+#define SPU_ABORT_STATUS 0x1C0
+#define SPU_FLUSH 0x200
+#define SPU_FLUSH_STATUS 0x240
+#define SPU_INVALPG_4K 0x280
+#define SPU_INVALPG_64K 0x2C0
+#define SPU_INVALPG_2M 0x300
+#define SPU_EMPTY 0x340
+#define SPU_ACTIVE 0x340
+#define SPU_FULL 0x380
+#define SPU_SOFT_RESET 0x3C0
+#define SPU_PMU_EVENT_SEL 0x400
+#define SPU_CONTROL2 0x440
+#define SPU_CONTROL3 0x480
+
+#define SPU_MEM_BW_LIMIT 0x4C0 // This is 64 bit register
+
+#define SPU_TCU_CREDITS 0x700
+#define SPU_FER 0x800
+#define SPU_ALT_FER 0x840
+#define SPU_MATCH_ACTION 0x880
+#define SPU_INVAL 0xB00
+#define SPU_COUNTER0_SET 0x500
+#define SPU_COUNTER1_SET 0x540
+#define SPU_COUNTER2_SET 0x580
+#define SPU_COUNTER3_SET 0x5C0
+#define SPU_COUNTER4_SET 0x600
+#define SPU_COUNTER5_SET 0x640
+#define SPU_COUNTER6_SET 0x680
+#define SPU_COUNTER7_SET 0x6C0
+
+#define CBOX_SPU_PA_MSR 0x0000017E
+#define CBOX_SPU_SAMPLER_BIND_MSR 0x0000017F
+
+#define MSR_SF_MASK 0xc0000084 /* syscall flags mask */
+#define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */
+#define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */
+#define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */
+
+// MSR's defined in the trace file sent during REQs
+// Are these all valid for L1OM??
+#define P6_CR_TSC 0x10
+#define X86_CR_APICBASE 0x1b
+#define MIC_CR_SPUBASE 0x1c
+#define IA32_CR_MISC 0x1a0
+#define WMT_CR_LASTBRANCH_0 0x1db
+#define WMT_CR_LASTBRANCH_1 0x1dc
+#define X86_CR_MTRRphysMask0 0x201
+#define X86_CR_MTRRphysMask1 0x203
+#define X86_CR_MTRRphysMask2 0x205
+#define X86_CR_MTRRphysMask3 0x207
+#define X86_CR_MTRRphysMask4 0x209
+#define X86_CR_MTRRphysMask5 0x20b
+#define X86_CR_MTRRphysMask6 0x20d
+#define X86_CR_MTRRphysMask7 0x20f
+#define IA32_CR_PAT 0x277
+#define IA32_MTRR_DEF_TYPE 0x2ff
+#define VMX_MSR_BASE 0x480
+#define VMX_MSR_BASE_PLUS_1 0x481
+#define VMX_MSR_BASE_PLUS_2 0x482
+#define VMX_MSR_BASE_PLUS_3 0x483
+#define VMX_MSR_BASE_PLUS_4 0x484
+#define VMX_MSR_BASE_PLUS_5 0x485
+#define VMX_MSR_BASE_PLUS_6 0x486
+#define VMX_MSR_BASE_PLUS_7 0x487
+#define VMX_MSR_BASE_PLUS_8 0x488
+#define VMX_MSR_BASE_PLUS_9 0x489
+#define TIME 0x4711
+#define PINFO 0x4712
+#define X86_CR_MTRRdefType 0x2ff
+#define X86_CR_MTRRcap 0xfe
+#define X86_CR_MTRRphysBase0 0x200
+#define X86_CR_MTRRphysBase1 0x202
+#define X86_CR_MTRRphysBase2 0x204
+#define X86_CR_MTRRphysBase3 0x206
+#define X86_CR_MTRRphysBase4 0x208
+#define X86_CR_MTRRphysBase5 0x20a
+#define X86_CR_MTRRphysBase6 0x20c
+#define X86_CR_MTRRphysBase7 0x20e
+#define X86_CR_MTRRfix64K_00000 0x250
+#define X86_CR_MTRRfix16K_80000 0x258
+#define X86_CR_MTRRfix16K_A0000 0x259
+#define X86_CR_MTRRfix4K_C0000 0x268
+#define X86_CR_MTRRfix4K_C8000 0x269
+#define X86_CR_MTRRfix4K_D0000 0x26a
+#define X86_CR_MTRRfix4K_D8000 0x26b
+#define X86_CR_MTRRfix4K_E0000 0x26c
+#define X86_CR_MTRRfix4K_E8000 0x26d
+#define X86_CR_MTRRfix4K_F0000 0x26e
+#define X86_CR_MTRRfix4K_F8000 0x26f
+#define P5_MC_ADDR 0x0
+#define P5_MC_TYPE 0x1
+#define MSR_TR1 0x2
+#define MSR_TR2 0x4
+#define MSR_TR3 0x5
+#define MSR_TR4 0x6
+#define MSR_TR5 0x7
+#define MSR_TR6 0x8
+#define MSR_TR7 0x9
+#define MSR_TR9 0xb
+#define MSR_TR10 0xc
+#define MSR_TR11 0xd
+#define MSR_TR12 0xe
+#define IA32_APIC_BASE 0x1b
+#define IA32_TIME_STAMP_COUNTER 0x10
+#define IA32_PerfCntr0 0x20
+#define IA32_PerfCntr1 0x21
+#define IA32_PerfCntr2 0x22
+#define IA32_PerfCntr3 0x23
+#define PerfFilteredCntr0 0x24
+#define PerfFilteredCntr1 0x25
+#define PerfFilteredCntr2 0x26
+#define PerfFilteredCntr3 0x27
+#define IA32_PerfEvtSel0 0x28
+#define IA32_PerfEvtSel1 0x29
+#define IA32_PerfEvtSel2 0x2a
+#define IA32_PerfEvtSel3 0x2b
+#define PerfFilterMask 0x2c
+#define IA32_PERF_GLOBAL_STATUS 0x2d
+#define IA32_PERF_GLOBAL_OVF_CONTROL 0x2e
+#define IA32_PERF_GLOBAL_CTRL 0x2f
+#define IA32_MCG_CTL 0x17b
+#define IA32_MC0_CTRL 0x400
+#define IA32_MC0_STAT 0x401
+#define IA32_MC0_ADDR 0x402
+#define IA32_MC0_MISC 0x403
+#define IA32_MC1_CTRL 0x404
+#define IA32_MC1_STAT 0x405
+#define IA32_MC1_ADDR 0x406
+#define IA32_MC1_MISC 0x407
+#define STAR 0xc0000081
+#define LSTAR 0xc0000082
+#define SYSCALL_FLAG_MASK 0xc0000084
+#define X86_PAT 0x277
+#define SPU_BASE 0x1C
+
+// Kernel virtual address to physical page at 0xfee03000
+// This is created by an ioremap outside of interrupt context.
+static uint8_t *spu_addr;
+
/*
 * Captured state for one ordinary segment register: its GDT descriptor,
 * selector, and base address (the base is filled in separately for
 * FS/GS from MSRs).  The 8-byte 'zero' pad keeps this struct the same
 * size/layout as struct mictc_tss below — the capture code walks all
 * eight entries with a single pointer and casts between the two types.
 */
struct mictc_seg {
	struct desc_struct desc;
	char zero[8];
	u16 selector;
	uint64_t base;
};

/* Captured state for a system segment (LDTR/TR): 16-byte descriptor. */
struct mictc_tss {
	tss_desc desc;
	u16 selector;
	uint64_t base;
};

/*
 * All segment-register state for one CPU, in the fixed order CS, DS,
 * ES, SS, FS, GS, LDTR, TR (must match SegRegNames[]).
 */
struct mictc_segment_reg
{
	struct mictc_seg cs;
	struct mictc_seg ds;
	struct mictc_seg es;
	struct mictc_seg ss;
	struct mictc_seg fs;
	struct mictc_seg gs;
	struct mictc_tss ldtr;
	struct mictc_tss tr;
};
+
+#define MAX_SEG_REG 8
+
+static char *SegRegNames[MAX_SEG_REG] = {"CS","DS","ES","SS", "FS","GS","LDTR","TR"};
+
+//static struct i387_fxsave_struct fpu;
+
/*
 * Per-CPU capture state beyond the pt_regs: segment registers, vector
 * (VPU) state, and x87/SSE FXSAVE state.
 */
struct mictc_trace
{
	struct mictc_segment_reg segment;
	struct vpustate_struct vpustate;
	struct i387_fxsave_struct fpu;
};

struct mictc_trace *trace;

// fxsave definition copied from fpu.c
//#define mictc_fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr)))
// NOTE(review): the constraints below look wrong — "=a"(addr) declares the
// pointer itself as an *output* (clobbering it), and the named [fx] input
// operand is never referenced in the template.  The commented-out "m" form
// above is the conventional one; confirm before relying on this macro.
#define mictc_fxsave(addr) __asm __volatile("fxsave (%0)" : "=a" (addr) : [fx] "a" (addr))
+
+
+// Spinlock to serialize access in IPI handler
+static DEFINE_SPINLOCK(mictc_lock);
+
+// Used to count the cpus waiting
+static atomic_t cpus_stopped = ATOMIC_INIT(0);
+
+// Used to count the cpus released
+static atomic_t cpus_released = ATOMIC_INIT(0);
+
+// End points for SCIF
+//static scif_epd_t mictc_endp_cmd;
+static scif_epd_t mictc_endp_data;
+
+// SCIF ports - temp hack; move to scif.h
+#define MICTC_SCIF_PORT_DATA 300
+
// Used to prevent concurrent access into the same device.
+static int Device_Open = 0;
+
+#define PS_BUF_SIZE 150
+//static char print_string_buf[PS_BUF_SIZE] = "";
+
+#define print_str(fmt, ...) \
+{ \
+ snprintf(print_string_buf, PS_BUF_SIZE, fmt, ##__VA_ARGS__); \
+ print_string(print_string_buf); \
+}
+
+//#define printk(fmt, ...) print_str(fmt, ##__VA_ARGS__)
+//#undef pr_crit
+//#define pr_crit(fmt, ...) print_str(fmt, ##__VA_ARGS__)
+
+// Interrupts off / on
+#define cli __asm (" cli\n")
+#define sti __asm (" sti\n")
+
+// Debug code to display low 16 bits of eflags register.
+#define print_eflags \
+ {unsigned long kernel_eflags; \
+ raw_local_save_flags(kernel_eflags); \
+ printk("%s:%d eflags %lx\n", __FUNCTION__, __LINE__, kernel_eflags); \
+ }
+
+
+// Find another definition of this in some .h file
/*
 * Execute CPUID with EAX = ax and store the resulting EAX, EBX, ECX,
 * EDX into p[0]..p[3].
 */
static __inline void
mictc_cpuid(u_int ax, u_int *p)
{
	__asm __volatile("cpuid"
			 : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
			 : "0" (ax));
}
+
/*
 * Read debug register DR<regno> (0-7); BUG()s on any other value.
 *
 * NOTE(review): 'val' is read as an unsigned long but the return type is
 * uint32_t, so on x86_64 the upper 32 bits are silently dropped.  The
 * callers in this file only print the value with %x, so this is harmless
 * today — widen the return type if full values are ever needed.
 */
static inline
uint32_t get_dr(int regno)
{
	unsigned long val = 0;	/* Damn you, gcc! */

	switch (regno) {
	case 0:
		asm("mov %%db0, %0" :"=r" (val));
		break;
	case 1:
		asm("mov %%db1, %0" :"=r" (val));
		break;
	case 2:
		asm("mov %%db2, %0" :"=r" (val));
		break;
	case 3:
		asm("mov %%db3, %0" :"=r" (val));
		break;
	case 4:
		asm("mov %%db4, %0" :"=r" (val));
		break;
	case 5:
		asm("mov %%db5, %0" :"=r" (val));
		break;
	case 6:
		asm("mov %%db6, %0" :"=r" (val));
		break;
	case 7:
		asm("mov %%db7, %0" :"=r" (val));
		break;
	default:
		BUG();
	}
	return val;
}
+
+
/* Store the current LDT selector (SLDT instruction) into *dtr. */
static inline void mictc_store_ldt(u16 *dtr)
{
	asm volatile("sldt %0":"=m" (*dtr));
}
+
+
/* Store the current task register selector (STR instruction) into *dtr. */
static inline void mictc_store_tr(u16 *dtr)
{
	asm volatile("str %0":"=m" (*dtr));
}
+
+
+static inline void read_gdt_entry(struct desc_struct *gdt, int entry,
+ void *desc, int type)
+{
+ unsigned int size;
+ switch (type) {
+ case DESC_TSS:
+ size = sizeof(tss_desc);
+ break;
+ case DESC_LDT:
+ size = sizeof(ldt_desc);
+ break;
+ default:
+ size = sizeof(struct desc_struct);
+ break;
+ }
+ memcpy(desc, &gdt[entry], size);
+#if 0 // Helpful for debug
+ { u64 *p = (u64 *)&gdt[entry];
+ printk("GDT[entry] = %p %llx %llx\n", &gdt[entry], p[0], p[1]);
+ }
+#endif
+}
+
+
+static inline void __get_tss_desc(unsigned cpu, unsigned int entry, void *dest)
+{
+ struct desc_struct *d = get_cpu_gdt_table(cpu);
+ read_gdt_entry(d, entry, dest, DESC_TSS);
+}
+
+#define get_tss_desc(cpu, addr) __get_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+
/* Copy a plain (8-byte) segment descriptor from this CPU's GDT into *dest. */
static inline void __get_seg_desc(unsigned cpu, unsigned int entry, void *dest)
{
	read_gdt_entry(get_cpu_gdt_table(cpu), entry, dest, 0);
}
+
+#define get_seg_desc(cpu, seg, addr) __get_seg_desc(cpu, ((seg & 0xffff) >> 3), addr)
+
+// Redefine rdmsr to work like BSD.
+
+//#undef rdmsr
+//#define rdmsr(msr) tc_msr((msr))
+
/*
 * Read MSR 'msrid' and return the full 64-bit value (the kernel rdmsr()
 * macro yields it as two 32-bit halves).
 */
static inline
uint64_t tc_rdmsr(uint32_t msrid)
{
	uint32_t lo, hi;

	rdmsr(msrid, lo, hi);
	return ((uint64_t)hi << 32) | lo;
}
+
+// Number of Retries before it is assumed that the Host will not respond
+#define TRACE_CAPTURE_TIMEOUT 50000000
+
+static void *g_traceBufferAllocated;
+
+// Global variable used by initiator to wait for everyone to complete trace captures
+//static volatile u32 g_smpTraceCaptureWait;
+
+// Global variable to keep track of how much data we are writing to the shared buffer
+// with the Host.
+static volatile u64 g_sizeXferred = 0;
+
+static s64 g_triggerFound = -1;
+
+static volatile u64 *g_traceBufferStatusOffset = NULL;
+static volatile u64 *g_traceBufferSizeOffset = NULL;
+static volatile u32 *g_traceBufferDataOffset = 0;
+static volatile u32 *g_traceBufferTriggerOffset = NULL;
+
+// This is an array of trigger numbers. The value TRACE_EOL is ignored.
+static u32 g_traceTriggers[TRACE_TRIGGER_MAX];
+static u32 g_traceCurrentTrigger;
+
+static long scif_offset_xml;
+//static long scif_offset_xml_dst;
+static long scif_offset_mem;
+static long scif_offset_dst;
+
+#if MIC_TRACE_CAPTURE_MEMORY_TEST
+static volatile u64 *g_traceBufferChecksumOffset = NULL;
+
+// The maximum size allowed for a DMA transfer is 1MB - 4K. The size of this array
+// is 1MB to allow this to be used as the dst memory while dumping entire GDDR
+// For Debug purposes only.
+static u32 g_dstMemoryDump[4096/sizeof(u32)] __attribute__ ((aligned(4096)));
+#endif
+
+#define TRACE_SPRINTF(...) \
+ (g_sizeXferred += sprintf(((char*)g_traceBufferDataOffset + g_sizeXferred), __VA_ARGS__))
+
+#define ADD_SPU_REG_TO_HEADER(x) \
+ TRACE_SPRINTF("\t\t\t\t<reg offset=\"0x%x\">\n\t\t\t\t\t<name>%s</name>\n\t\t\t\t</reg>\n", (x), #x)
+
+#define ADD_MSR_TO_HEADER(x) \
+ TRACE_SPRINTF("\t\t\t\t<reg addr=\"0x%x\"/>\n", (x))
+
+#define TRACE_SPRINTF_MSR(x) \
+ TRACE_SPRINTF("\t\t\t\t<reg addr=\"0x%x\">0x%llx</reg>\n", (x), tc_rdmsr((x)))
+
+#define TRACE_SPRINTF_SPU(x) \
+ TRACE_SPRINTF("\t\t\t\t<reg offset=\"0x%x\">0x%llx</reg>\n", (x), *(volatile u64*)((u8*)spu_addr + (x)))
+
+#define TRACE_SPRINTF_VECTOR(x, vpu) \
+ PrintVector((u8*)&(vpu), (x))
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_trace_capture_prep_SPU_header
+//
+// DESCRIPTION:
+// Perform all the tasks related to preparing the SPU Trace Header
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_trace_capture_prep_SPU_header(void)
{
	// Emit the <spu> definition section: register offsets and names
	// only (values are captured later by mictc_capture_SPU_reg).
	TRACE_SPRINTF("\t\t\t<spu>\n");
	ADD_SPU_REG_TO_HEADER(SPU_XQ_SIZE);
	ADD_SPU_REG_TO_HEADER(SPU_XQ_BASE);
	ADD_SPU_REG_TO_HEADER(SPU_XQ_INDEX);
	ADD_SPU_REG_TO_HEADER(SPU_CONTROL);
	ADD_SPU_REG_TO_HEADER(SPU_SAMPLER_BASE);
	ADD_SPU_REG_TO_HEADER(SPU_PMU_EVENT_SEL);
	ADD_SPU_REG_TO_HEADER(SPU_CONTROL2);
	ADD_SPU_REG_TO_HEADER(SPU_CONTROL3);
	TRACE_SPRINTF("\t\t\t</spu>\n");
}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_trace_capture_prep_cpuid_header
+//
+// DESCRIPTION:
+// Perform all the tasks related to preparing the CPUID Trace Header
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
+static void
+mictc_trace_capture_prep_cpuid_header(void)
+{
+ u_int regs[4];
+ int i =0;
+ TRACE_SPRINTF("\t\t\t<cpuid>\n");
+ for (i = 0; i < 0x4; i++)
+ {
+ mictc_cpuid(i, regs);
+ TRACE_SPRINTF("\t\t\t\t<reg eax=\"0x%x\">0x%x-0x%x-0x%x-0x%x</reg>\n",
+ i, regs[0], regs[1], regs[2], regs[3]);
+ }
+ TRACE_SPRINTF("\t\t\t</cpuid>\n");
+}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_trace_capture_prep_msr_header
+//
+// DESCRIPTION:
+// Perform all the tasks related to preparing the MSR Trace Header
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_trace_capture_prep_msr_header(void)
{
	// Emit the <msr> definition section: the list of MSR addresses the
	// capture will record (addresses only; values are dumped later).
	TRACE_SPRINTF("\t\t\t<msr>\n");
	ADD_MSR_TO_HEADER(P6_CR_TSC);
	ADD_MSR_TO_HEADER(X86_CR_APICBASE);
	ADD_MSR_TO_HEADER(CBOX_SPU_PA_MSR);
	ADD_MSR_TO_HEADER(SPU_BASE);
	ADD_MSR_TO_HEADER(CBOX_SPU_SAMPLER_BIND_MSR);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask0);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask1);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask2);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask3);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask4);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask5);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask6);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysMask7);
	ADD_MSR_TO_HEADER(MSR_EFER);
	ADD_MSR_TO_HEADER(MSR_SF_MASK);
	ADD_MSR_TO_HEADER(MSR_FSBASE);
	ADD_MSR_TO_HEADER(MSR_GSBASE);
	ADD_MSR_TO_HEADER(X86_CR_MTRRdefType);
	ADD_MSR_TO_HEADER(X86_CR_MTRRcap);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase2);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase0);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase1);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase3);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase4);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase5);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase6);
	ADD_MSR_TO_HEADER(X86_CR_MTRRphysBase7);
	ADD_MSR_TO_HEADER(STAR);
	ADD_MSR_TO_HEADER(LSTAR);
	ADD_MSR_TO_HEADER(MSR_KGSBASE);

	// The following MSR's are currently ifdef'd out
	// because LarrySim barfs on these.
	// We might need these later.
#if 0
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix64K_00000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix16K_80000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix16K_A0000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_C0000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_C8000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_D0000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_D8000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_E0000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_E8000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_F0000);
	ADD_MSR_TO_HEADER(X86_CR_MTRRfix4K_F8000);
	ADD_MSR_TO_HEADER(P5_MC_ADDR);
	ADD_MSR_TO_HEADER(P5_MC_TYPE);
	ADD_MSR_TO_HEADER(MSR_TR1);
	ADD_MSR_TO_HEADER(MSR_TR2);
	ADD_MSR_TO_HEADER(MSR_TR3);
	ADD_MSR_TO_HEADER(MSR_TR4);
	ADD_MSR_TO_HEADER(MSR_TR5);
	ADD_MSR_TO_HEADER(MSR_TR6);
	ADD_MSR_TO_HEADER(MSR_TR7);
	ADD_MSR_TO_HEADER(MSR_TR9);
	ADD_MSR_TO_HEADER(MSR_TR10);
	ADD_MSR_TO_HEADER(MSR_TR11);
	ADD_MSR_TO_HEADER(MSR_TR12);
	ADD_MSR_TO_HEADER(IA32_APIC_BASE);
	ADD_MSR_TO_HEADER(IA32_TIME_STAMP_COUNTER);
	ADD_MSR_TO_HEADER(IA32_PerfCntr0);
	ADD_MSR_TO_HEADER(IA32_PerfCntr1);
	ADD_MSR_TO_HEADER(IA32_PerfCntr2);
	ADD_MSR_TO_HEADER(IA32_PerfCntr3);
	ADD_MSR_TO_HEADER(PerfFilteredCntr0);
	ADD_MSR_TO_HEADER(PerfFilteredCntr1);
	ADD_MSR_TO_HEADER(PerfFilteredCntr2);
	ADD_MSR_TO_HEADER(PerfFilteredCntr3);
	ADD_MSR_TO_HEADER(IA32_PerfEvtSel0);
	ADD_MSR_TO_HEADER(IA32_PerfEvtSel1);
	ADD_MSR_TO_HEADER(IA32_PerfEvtSel2);
	ADD_MSR_TO_HEADER(IA32_PerfEvtSel3);
	ADD_MSR_TO_HEADER(PerfFilterMask);
	ADD_MSR_TO_HEADER(IA32_PERF_GLOBAL_STATUS);
	ADD_MSR_TO_HEADER(IA32_PERF_GLOBAL_OVF_CONTROL);
	ADD_MSR_TO_HEADER(IA32_PERF_GLOBAL_CTRL);
	ADD_MSR_TO_HEADER(IA32_MCG_CTL);
	ADD_MSR_TO_HEADER(IA32_MC0_CTRL);
	ADD_MSR_TO_HEADER(IA32_MC0_STAT);
	ADD_MSR_TO_HEADER(IA32_MC0_ADDR);
	ADD_MSR_TO_HEADER(IA32_MC0_MISC);
	ADD_MSR_TO_HEADER(IA32_MC1_CTRL);
	ADD_MSR_TO_HEADER(IA32_MC1_STAT);
	ADD_MSR_TO_HEADER(IA32_MC1_ADDR);
	ADD_MSR_TO_HEADER(IA32_MC1_MISC);
	ADD_MSR_TO_HEADER(SYSCALL_FLAG_MASK);
	ADD_MSR_TO_HEADER(X86_PAT);
#endif
	TRACE_SPRINTF("\t\t\t</msr>\n");
}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_prep_header
+//
+// DESCRIPTION:
+// Perform all the tasks related to preparing the Trace Header
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_prep_header(void)
{
	int i;

	// XML prolog and fixed header text (format/version/comments).
	TRACE_SPRINTF("<?xml version=\"1.0\" standalone=\"yes\"?>\n");
	TRACE_SPRINTF("<arch_data>\n");
	TRACE_SPRINTF("<!-- The format of this file is defined in https://cpu-sim.intel.com/twiki/bin/view/CpuSim/TraceFileFormats. -->\n");
	TRACE_SPRINTF("\t<header>\n");
	TRACE_SPRINTF("\t\t<format_version>1.0</format_version>\n");
	TRACE_SPRINTF("\t\t<creation_date>Nov 19 2009</creation_date>\n");
	TRACE_SPRINTF("\t\t<arch_xml_ver>1.1</arch_xml_ver>\n");
	TRACE_SPRINTF("\t\t<arch_xml_date>Oct 21 2009</arch_xml_date>\n");
	TRACE_SPRINTF("\t\t<created_by>archlib</created_by>\n");
	TRACE_SPRINTF("\t\t<comment>Warnings! This is based on the state available in archlib.</comment>\n");
	TRACE_SPRINTF("\t\t<comment> This state dump is primarily good for capturing frequently used architectural register state.</comment>\n");
	TRACE_SPRINTF("\t\t<comment> Support for CPUId, MSRs, APIC, and x87 state is currently incomplete.</comment>\n");
	TRACE_SPRINTF("\t\t<comment> There is no support for state not specifically modeled in archlib.</comment>\n");
	TRACE_SPRINTF("\t\t<comment> Have also noticed inconsistencies in the final value of the RFLAGS reg.</comment>\n");
	// If a host-based trigger fired, record its number once, then reset
	// the flag so later captures don't repeat it.
	if (g_triggerFound != -1)
	{
		TRACE_SPRINTF("\t\t<comment> This capture is generated for HOST BASED TRIGGER # %lld.</comment>\n", g_triggerFound);
		g_triggerFound = -1;
	}
	TRACE_SPRINTF("\t</header>\n");
	TRACE_SPRINTF("\t<cpu_definition>\n");
	TRACE_SPRINTF("\t\t<num_cpus>%d</num_cpus>\n", num_online_cpus());
	TRACE_SPRINTF("<!-- the number of \"cpu\" definitions must correspond to the \"num_cpus\" data item -->\n");

	// One <cpu> definition per online CPU: SPU (disabled under Linux),
	// CPUID leaves, and the MSR list.
	for (i = 0; i < num_online_cpus(); i++)
	{
		TRACE_SPRINTF("\t\t<cpu num=\"%d\">\n", i);
// SPU is not supported in Linux
		if (always_false) mictc_trace_capture_prep_SPU_header();
		mictc_trace_capture_prep_cpuid_header();
		mictc_trace_capture_prep_msr_header();
		TRACE_SPRINTF("\t\t</cpu>\n");
	}

	TRACE_SPRINTF("\t</cpu_definition>\n");
	TRACE_SPRINTF("\t<platform_definition>\n");
	TRACE_SPRINTF("\t\t<physical_memory/>\n");
	TRACE_SPRINTF("\t</platform_definition>\n");
	TRACE_SPRINTF("\t<cpu_state>\n");
	TRACE_SPRINTF("<!-- the number of \"cpu\" definitions must correspond to the \"num_cpus\" data item -->\n");
}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_general_purpose_reg
+//
+// DESCRIPTION:
+// Capture all general purpose registers.
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_capture_general_purpose_reg(struct pt_regs *regs)
{
	// printk("starting reg dump regs=%llx\n", (uint64_t)regs);

	// Defensive check: bail out (emitting nothing) rather than oops if
	// no trap frame was supplied.
	if (!regs) {
		printk("Null pointer found. cpu %d %s\n", smp_processor_id(), current->comm);
		return;
	}

	// Emit every GPR from the trap frame as a <general> XML section.
	TRACE_SPRINTF("\t\t\t<general>\n");
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RAX\">0x%lx</reg>\n", regs->ax);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RBX\">0x%lx</reg>\n", regs->bx);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RCX\">0x%lx</reg>\n", regs->cx);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RDX\">0x%lx</reg>\n", regs->dx);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RBP\">0x%lx</reg>\n", regs->bp);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RSP\">0x%lx</reg>\n", regs->sp);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RSI\">0x%lx</reg>\n", regs->si);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RDI\">0x%lx</reg>\n", regs->di);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R8\">0x%lx</reg>\n", regs->r8);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R9\">0x%lx</reg>\n", regs->r9);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R10\">0x%lx</reg>\n", regs->r10);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R11\">0x%lx</reg>\n", regs->r11);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R12\">0x%lx</reg>\n", regs->r12);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R13\">0x%lx</reg>\n", regs->r13);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R14\">0x%lx</reg>\n", regs->r14);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"R15\">0x%lx</reg>\n", regs->r15);
// In cases where a CPU is halted and is woken up from halt by the trace capture IPI
// we want to report the RIP as the one pointing to the halt instruction itself
// and not the one on the trap frame. This is to avoid the condition where the simulator-run
// for these halted CPUs ends up running extra cycles (before going back idle)
// which would not happen under actual conditions. Problem reported by Jason S.
////  if(regs->tf_rip == (register_t)ExitIdle)
////    TRACE_SPRINTF("\t\t\t\t<reg name=\"RIP\">0x%lx</reg>\n", regs->ip-1);
////  else
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RIP\">0x%lx</reg>\n", regs->ip);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"RFLAGS\">0x%lx</reg>\n", regs->flags);
	TRACE_SPRINTF("\t\t\t</general>\n");

	// printk("ending reg dump\n");
}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_segment_reg
+//
+// DESCRIPTION:
+// Capture all segment registers.
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_capture_segment_reg(struct mictc_segment_reg *segment, struct pt_regs *regs)
{
	int i, v;
	struct desc_ptr gdtr;
	struct desc_ptr idtr;
	struct mictc_seg *segreg;

// printk("Segment registers on cpu %d\n", smp_processor_id());

	// This is only useful during initial development.
	if (!regs) {
		printk("Null pointer found. cpu %d %s\n", smp_processor_id(), current->comm);
		return;
	}

	// Gather selectors: CS/SS from the trap frame, DS/ES read live,
	// FS/GS from the current thread structure.
	segment->cs.selector = (u16)regs->cs;
	segment->ss.selector = (u16)regs->ss;
#if 0
	if (ISPL(regs->tf_cs) == SEL_KPL && curthread->td_pcb->pcb_ds == 0x0) {
		// Specifically required for kernel IDLE thread
		segment->ds = 0x10;
		segment->es = 0x10;
		segment->fs = 0x10;
		segment->gs = 0x10;
	} else {
#endif
	asm("movl %%ds,%0" : "=r" (v)); segment->ds.selector = v;
	asm("movl %%es,%0" : "=r" (v)); segment->es.selector = v;
	segment->fs.selector = current->thread.fs;
	segment->gs.selector = current->thread.gs;
// }
	mictc_store_tr(&(segment->tr.selector));
	get_tss_desc(smp_processor_id(), &(segment->tr.desc));
	store_gdt(&gdtr);
	store_idt(&idtr);
	mictc_store_ldt(&(segment->ldtr.selector));
	// LDT is not used, so zeros will be printed.

	TRACE_SPRINTF("\t\t\t<segment>\n");
	segreg = (struct mictc_seg *)&(segment->cs);

	// Walk CS,DS,ES,SS,FS,GS,LDTR,TR in order (matches SegRegNames[]).
	// Entries 0-5 are plain segments; entries 6-7 are system segments
	// re-viewed through struct mictc_tss — this relies on the two
	// structs sharing size/layout.
	for(i=0; i < MAX_SEG_REG; i++) {
		// FS/GS bases live in MSRs, not in the descriptor.
		if (strcmp(SegRegNames[i], "GS") == 0) {
			segreg->base = tc_rdmsr(MSR_KGSBASE);
		}
		if (strcmp(SegRegNames[i], "FS") == 0) {
			segreg->base = tc_rdmsr(MSR_FSBASE);
		}

		// Fill in the segment descriptor for cs to gs
		if (i <= 5) {
			get_seg_desc(smp_processor_id(), segreg->selector, &(segreg->desc));
		}

		TRACE_SPRINTF("\t\t\t\t<reg name=\"%s\">\n",SegRegNames[i]);
		if (i > 5) { // LDT and TSS
			struct mictc_tss *segreg1 =(struct mictc_tss *)segreg;

			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"base\">0x%llx</attr>\n", ((uint64_t)segreg1->desc.base3 << 32) | (uint64_t)((segreg1->desc.base2 << 24) | (segreg1->desc.base1 << 16) | segreg1->desc.base0));
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"limit\">0x%x</attr>\n", (segreg1->desc.limit1 << 16) | segreg1->desc.limit0);
			TRACE_SPRINTF("\t\t\t\t\t<selector>0x%x</selector>\n", segreg1->selector);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"G\">0x%x</attr>\n", segreg1->desc.g);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"DB\">0x%x</attr>\n", 0); // double word of base and limit
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"L\">0x%x</attr>\n", 0);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"AVL\">0x0</attr>\n");//AVL bit not populated in the gdt[] array
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"P\">0x%x</attr>\n", segreg1->desc.p);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"DPL\">0x%x</attr>\n", segreg1->desc.dpl);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"S\">0x%x</attr>\n", segreg1->desc.type & 0x10 ? 1 : 0); //The S bit (descriptor type) is clubbed along with the ssd_type element.
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"TYPE\">0x%x</attr>\n", (segreg1->desc.type & 0xf));
		} else {
			// Prefer the MSR-derived base (FS/GS) when present.
			if (segreg->base) {
				TRACE_SPRINTF("\t\t\t\t\t<attr name=\"base\">0x%llx</attr>\n", segreg->base);
			} else {
				TRACE_SPRINTF("\t\t\t\t\t<attr name=\"base\">0x%x</attr>\n", (segreg->desc.base2 << 24) | (segreg->desc.base1 << 16) |segreg->desc.base0);
			}
			if (segreg->desc.l) segreg->desc.a = 0;
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"limit\">0x%x</attr>\n", (segreg->desc.limit << 16) | segreg->desc.limit0);
			TRACE_SPRINTF("\t\t\t\t\t<selector>0x%x</selector>\n", segreg->selector);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"G\">0x%x</attr>\n", segreg->desc.g);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"DB\">0x%x</attr>\n", segreg->desc.a & 1); // double word of base and limit
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"L\">0x%x</attr>\n", segreg->desc.l);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"AVL\">0x0</attr>\n");//AVL bit not populated in the gdt[] array
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"P\">0x%x</attr>\n", segreg->desc.p);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"DPL\">0x%x</attr>\n", segreg->desc.dpl);
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"S\">0x%x</attr>\n", segreg->desc.type & 0x10 ? 1 : 0); //The S bit (descriptor type) is clubbed along with the ssd_type element.
			TRACE_SPRINTF("\t\t\t\t\t<attr name=\"TYPE\">0x%x</attr>\n", (segreg->desc.type & 0xf));
		}
		TRACE_SPRINTF("\t\t\t\t</reg>\n");
		segreg++;
	}

	// Descriptor-table registers captured above via store_gdt/store_idt.
	TRACE_SPRINTF("\t\t\t\t<reg name=\"GDTR\">\n");
	TRACE_SPRINTF("\t\t\t\t\t<attr name=\"base\">0x%lx</attr>\n", gdtr.address);
	TRACE_SPRINTF("\t\t\t\t\t<attr name=\"limit\">0x%x</attr>\n", gdtr.size);
	TRACE_SPRINTF("\t\t\t\t</reg>\n");
	TRACE_SPRINTF("\t\t\t\t<reg name=\"IDTR\">\n");
	TRACE_SPRINTF("\t\t\t\t\t<attr name=\"base\">0x%lx</attr>\n", idtr.address);
	TRACE_SPRINTF("\t\t\t\t\t<attr name=\"limit\">0x%x</attr>\n", idtr.size);
	TRACE_SPRINTF("\t\t\t\t</reg>\n");

	TRACE_SPRINTF("\t\t\t</segment>\n");

// printk("End of mictc_capture_segment_reg\n");

}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_debug_reg
+//
+// DESCRIPTION:
+// Capture all debug registers.
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_capture_debug_reg(void)
{
	// Emit DR0-DR3, DR6, DR7 (DR4/DR5 are intentionally skipped — see
	// the commented-out lines below).
	TRACE_SPRINTF("\t\t\t<debug>\n");
	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR0\">0x%x</reg>\n", get_dr(0));
	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR1\">0x%x</reg>\n", get_dr(1));
	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR2\">0x%x</reg>\n", get_dr(2));
	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR3\">0x%x</reg>\n", get_dr(3));
// These don't exist.
//	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR4\">0x%x</reg>\n", get_dr(4));
//	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR5\">0x%x</reg>\n", get_dr(5));
	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR6\">0x%x</reg>\n", get_dr(6));
	TRACE_SPRINTF("\t\t\t\t<reg name=\"DR7\">0x%x</reg>\n", get_dr(7));
	TRACE_SPRINTF("\t\t\t</debug>\n");
}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_control_reg
+//
+// DESCRIPTION:
+// Capture all control registers.
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_capture_control_reg(void)
{
	// Emit CR0/CR2/CR3/CR4/CR8.  CR0 and CR4 are masked to their low
	// 32 bits before printing.
	TRACE_SPRINTF("\t\t\t<control>\n");
	TRACE_SPRINTF("\t\t\t\t<reg name=\"CR0\">0x%lx</reg>\n", (read_cr0()) & 0xffffffff);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"CR2\">0x%lx</reg>\n", read_cr2());
	TRACE_SPRINTF("\t\t\t\t<reg name=\"CR3\">0x%lx</reg>\n", read_cr3());
	TRACE_SPRINTF("\t\t\t\t<reg name=\"CR4\">0x%lx</reg>\n", (read_cr4()) & 0xffffffff);
	TRACE_SPRINTF("\t\t\t\t<reg name=\"CR8\">0x%lx</reg>\n", read_cr8());
	TRACE_SPRINTF("\t\t\t</control>\n");
}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_SPU_reg
+//
+// DESCRIPTION:
+// Capture all SPU registers.
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
static void
mictc_capture_SPU_reg(void)
{
	// Entire body is compiled out: the SPU is not set up under Linux,
	// so this function is currently a no-op.
#if 0
	// FIXME - The SPU is not setup currently in Linux

	TRACE_SPRINTF("\t\t\t<spu>\n");
	TRACE_SPRINTF_SPU(SPU_XQ_SIZE);
	TRACE_SPRINTF_SPU(SPU_XQ_BASE);
	TRACE_SPRINTF_SPU(SPU_XQ_INDEX);
	TRACE_SPRINTF_SPU(SPU_CONTROL);
	TRACE_SPRINTF_SPU(SPU_SAMPLER_BASE);
	TRACE_SPRINTF_SPU(SPU_PMU_EVENT_SEL);
	TRACE_SPRINTF_SPU(SPU_CONTROL2);
	TRACE_SPRINTF_SPU(SPU_CONTROL3);
	TRACE_SPRINTF("\t\t\t</spu>\n");
#endif
}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: PrintVector
+//
+// DESCRIPTION:
+// Emit one 64-byte (_m512) vector register as a single XML <reg> element.
+// Bytes are printed from res_mem[63] down to res_mem[0] -- most significant
+// byte first -- so the value reads as one 512-bit hex literal.
+//
+// PARAMETERS: res_mem - pointer to the 64 raw bytes of the register
+//             reg_num - register number used in the name attribute ("V<n>")
+//
+// RETURNS: None
+//
+// TODOS:
+//
+static void
+PrintVector(u8 *res_mem, int reg_num)
+{
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"V%d\">0x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x</reg>\n",
+ reg_num,
+ res_mem[63], res_mem[62], res_mem[61], res_mem[60], res_mem[59], res_mem[58], res_mem[57], res_mem[56],
+ res_mem[55], res_mem[54], res_mem[53], res_mem[52], res_mem[51], res_mem[50], res_mem[49], res_mem[48],
+ res_mem[47], res_mem[46], res_mem[45], res_mem[44], res_mem[43], res_mem[42], res_mem[41], res_mem[40],
+ res_mem[39], res_mem[38], res_mem[37], res_mem[36], res_mem[35], res_mem[34], res_mem[33], res_mem[32],
+ res_mem[31], res_mem[30], res_mem[29], res_mem[28], res_mem[27], res_mem[26], res_mem[25], res_mem[24],
+ res_mem[23], res_mem[22], res_mem[21], res_mem[20], res_mem[19], res_mem[18], res_mem[17], res_mem[16],
+ res_mem[15], res_mem[14], res_mem[13], res_mem[12], res_mem[11], res_mem[10], res_mem[9], res_mem[8],
+ res_mem[7], res_mem[6], res_mem[5], res_mem[4], res_mem[3], res_mem[2], res_mem[1], res_mem[0]);
+}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: PrintFPRegister
+//
+// DESCRIPTION:
+// Emit one 80-bit (10-byte) x87 floating-point register as an XML <reg>
+// element. Bytes are printed from res_mem[9] down to res_mem[0], i.e.
+// most significant byte first.
+//
+// PARAMETERS: res_mem - pointer to the 10 raw bytes of the register
+//             reg_num - register number used in the name attribute ("FR<n>")
+//
+// RETURNS: None
+//
+// TODOS:
+//
+static void
+PrintFPRegister(u8 *res_mem, int reg_num)
+{
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"FR%d\">0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x</reg>\n",
+ reg_num,
+ res_mem[9],
+ res_mem[8],
+ res_mem[7],
+ res_mem[6],
+ res_mem[5],
+ res_mem[4],
+ res_mem[3],
+ res_mem[2],
+ res_mem[1],
+ res_mem[0]);
+}
+
+
+// VPU Instructions
+//
+// These macros expand to inline-asm fragments that store VPU state through
+// the buffer address held in %rax:
+//   VSTORED_DISP32_EAX(v, d)  - store vector register v at byte offset d
+//   VKSTORE_DISP32_EAX(k, d)  - store 16-bit mask register k at offset d
+//                               (staged through %ebx/%bx, hence the "ebx"
+//                               clobber at the asm statement that uses it)
+//   STVXCSR_DISP32_EAX(d)     - store the VPU control/status register at d
+// L1OM and K1OM silicon use different mnemonics for the same operations,
+// selected by CONFIG_ML1OM below.
+
+#ifdef CONFIG_ML1OM
+#define VSTORED_DISP32_EAX(v, disp32) " vstored %%v" #v "," #disp32 "(%%rax)\n"
+
+#define VKSTORE_DISP32_EAX(k, disp32) \
+ " vkmov %%k" #k ",%%ebx\n" \
+ " movw %%bx, " #disp32 "(%%rax)\n"
+
+#define STVXCSR_DISP32_EAX(disp32) " stvxcsr " #disp32 "(%%rax)\n"
+
+#else
+// For K1OM
+#define VSTORED_DISP32_EAX(v, disp32) " vpackstorelps %%zmm" #v "," #disp32 "(%%rax)\n"
+
+#define VKSTORE_DISP32_EAX(k, disp32) \
+ " kmov %%k" #k ",%%ebx\n" \
+ " movw %%bx, " #disp32 "(%%rax)\n"
+
+#define STVXCSR_DISP32_EAX(disp32) " stmxcsr " #disp32 "(%%rax)\n"
+#endif
+
+// Store the complete VPU state into *vpustate, whose address is preloaded
+// into %rax via the [fx] input constraint:
+//   - vector registers 0..31, 64 bytes each, at offsets 0x000..0x7c0
+//   - mask registers k0..k7, 2 bytes each, at offsets 0x800..0x80e
+//   - the VPU control/status register at offset 0x810
+// NOTE(review): the output constraint "=m" (vpustate) covers the pointer
+// variable itself, not the buffer it points to; a "memory" clobber would
+// describe the stores more accurately -- confirm before relying on the
+// compiler's ordering around this asm.
+static inline void save_vpu(struct vpustate_struct *vpustate)
+{
+ asm volatile(
+ VSTORED_DISP32_EAX(0, 0x00)
+ VSTORED_DISP32_EAX(1, 0x40)
+ VSTORED_DISP32_EAX(2, 0x80)
+ VSTORED_DISP32_EAX(3, 0xc0)
+ VSTORED_DISP32_EAX(4, 0x100)
+ VSTORED_DISP32_EAX(5, 0x140)
+ VSTORED_DISP32_EAX(6, 0x180)
+ VSTORED_DISP32_EAX(7, 0x1c0)
+ VSTORED_DISP32_EAX(8, 0x200)
+ VSTORED_DISP32_EAX(9, 0x240)
+ VSTORED_DISP32_EAX(10, 0x280)
+ VSTORED_DISP32_EAX(11, 0x2c0)
+ VSTORED_DISP32_EAX(12, 0x300)
+ VSTORED_DISP32_EAX(13, 0x340)
+ VSTORED_DISP32_EAX(14, 0x380)
+ VSTORED_DISP32_EAX(15, 0x3c0)
+ VSTORED_DISP32_EAX(16, 0x400)
+ VSTORED_DISP32_EAX(17, 0x440)
+ VSTORED_DISP32_EAX(18, 0x480)
+ VSTORED_DISP32_EAX(19, 0x4c0)
+ VSTORED_DISP32_EAX(20, 0x500)
+ VSTORED_DISP32_EAX(21, 0x540)
+ VSTORED_DISP32_EAX(22, 0x580)
+ VSTORED_DISP32_EAX(23, 0x5c0)
+ VSTORED_DISP32_EAX(24, 0x600)
+ VSTORED_DISP32_EAX(25, 0x640)
+ VSTORED_DISP32_EAX(26, 0x680)
+ VSTORED_DISP32_EAX(27, 0x6c0)
+ VSTORED_DISP32_EAX(28, 0x700)
+ VSTORED_DISP32_EAX(29, 0x740)
+ VSTORED_DISP32_EAX(30, 0x780)
+ VSTORED_DISP32_EAX(31, 0x7c0)
+ VKSTORE_DISP32_EAX(0, 0x800)
+ VKSTORE_DISP32_EAX(1, 0x802)
+ VKSTORE_DISP32_EAX(2, 0x804)
+ VKSTORE_DISP32_EAX(3, 0x806)
+ VKSTORE_DISP32_EAX(4, 0x808)
+ VKSTORE_DISP32_EAX(5, 0x80a)
+ VKSTORE_DISP32_EAX(6, 0x80c)
+ VKSTORE_DISP32_EAX(7, 0x80e)
+ STVXCSR_DISP32_EAX(0x810)
+ : "=m" (vpustate) : [fx] "a" (vpustate) : "ebx"
+ );
+}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_vector_reg
+//
+// DESCRIPTION:
+// Capture all vector-unit registers: snapshot the VPU into *vpustate via
+// save_vpu(), then emit the eight mask registers k0-k7, the 32 vector
+// registers V0-V31, and VXCSR as XML <reg> elements.
+//
+// PARAMETERS: vpustate - scratch area the VPU state is saved into
+//
+// RETURNS: None
+//
+// TODOS:
+//
+static void
+mictc_capture_vector_reg(struct vpustate_struct *vpustate)
+{
+ // printk("vpustate = %p\n", vpustate);
+
+ save_vpu(vpustate);
+
+ TRACE_SPRINTF("\t\t\t<vpu>\n");
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K0\">0x%x</reg>\n", vpustate->k[0]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K1\">0x%x</reg>\n", vpustate->k[1]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K2\">0x%x</reg>\n", vpustate->k[2]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K3\">0x%x</reg>\n", vpustate->k[3]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K4\">0x%x</reg>\n", vpustate->k[4]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K5\">0x%x</reg>\n", vpustate->k[5]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K6\">0x%x</reg>\n", vpustate->k[6]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"K7\">0x%x</reg>\n", vpustate->k[7]);
+ // Index stride of 16 per 64-byte register implies vector_space elements
+ // are 4 bytes wide -- TODO confirm against struct vpustate_struct.
+ TRACE_SPRINTF_VECTOR(0, vpustate->vector_space[0]);
+ TRACE_SPRINTF_VECTOR(1, vpustate->vector_space[16]);
+ TRACE_SPRINTF_VECTOR(2, vpustate->vector_space[32]);
+ TRACE_SPRINTF_VECTOR(3, vpustate->vector_space[48]);
+ TRACE_SPRINTF_VECTOR(4, vpustate->vector_space[64]);
+ TRACE_SPRINTF_VECTOR(5, vpustate->vector_space[80]);
+ TRACE_SPRINTF_VECTOR(6, vpustate->vector_space[96]);
+ TRACE_SPRINTF_VECTOR(7, vpustate->vector_space[112]);
+ TRACE_SPRINTF_VECTOR(8, vpustate->vector_space[128]);
+ TRACE_SPRINTF_VECTOR(9, vpustate->vector_space[144]);
+ TRACE_SPRINTF_VECTOR(10, vpustate->vector_space[160]);
+ TRACE_SPRINTF_VECTOR(11, vpustate->vector_space[176]);
+ TRACE_SPRINTF_VECTOR(12, vpustate->vector_space[192]);
+ TRACE_SPRINTF_VECTOR(13, vpustate->vector_space[208]);
+ TRACE_SPRINTF_VECTOR(14, vpustate->vector_space[224]);
+ TRACE_SPRINTF_VECTOR(15, vpustate->vector_space[240]);
+ TRACE_SPRINTF_VECTOR(16, vpustate->vector_space[256]);
+ TRACE_SPRINTF_VECTOR(17, vpustate->vector_space[272]);
+ TRACE_SPRINTF_VECTOR(18, vpustate->vector_space[288]);
+ TRACE_SPRINTF_VECTOR(19, vpustate->vector_space[304]);
+ TRACE_SPRINTF_VECTOR(20, vpustate->vector_space[320]);
+ TRACE_SPRINTF_VECTOR(21, vpustate->vector_space[336]);
+ TRACE_SPRINTF_VECTOR(22, vpustate->vector_space[352]);
+ TRACE_SPRINTF_VECTOR(23, vpustate->vector_space[368]);
+ TRACE_SPRINTF_VECTOR(24, vpustate->vector_space[384]);
+ TRACE_SPRINTF_VECTOR(25, vpustate->vector_space[400]);
+ TRACE_SPRINTF_VECTOR(26, vpustate->vector_space[416]);
+ TRACE_SPRINTF_VECTOR(27, vpustate->vector_space[432]);
+ TRACE_SPRINTF_VECTOR(28, vpustate->vector_space[448]);
+ TRACE_SPRINTF_VECTOR(29, vpustate->vector_space[464]);
+ TRACE_SPRINTF_VECTOR(30, vpustate->vector_space[480]);
+ TRACE_SPRINTF_VECTOR(31, vpustate->vector_space[496]);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"VXCSR\">0x%x</reg>\n", vpustate->vxcsr);
+ TRACE_SPRINTF("\t\t\t</vpu>\n");
+}
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_FPU_reg
+//
+// DESCRIPTION:
+// Capture the x87 FPU state: run fxsave into *fpu, then emit the control,
+// status and tag words, the instruction/data pointer fields, and the eight
+// stack registers FR0-FR7 as XML elements.
+//
+// PARAMETERS: fpu - fxsave area the FPU state is saved into
+//
+// RETURNS: None
+//
+// TODOS:
+//
+static void
+mictc_capture_FPU_reg(struct i387_fxsave_struct *fpu)
+{
+
+/*
+ Get FPU contents from the registers instead of the PCB.
+ fxsave on L1OM saves only the x87 FPU registers and not the SSE2 and MMX registers.
+ For format of the data below refer Intel 64 and IA-32 Arch. SDM Vol 2A Instr Set Ref A-M
+ tables 3-59 & 3-60.
+*/
+ mictc_fxsave(fpu);
+
+ TRACE_SPRINTF("\t\t\t<fp>\n");
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"CW\">0x%x</reg>\n", fpu->cwd);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"SW\">0x%x</reg>\n", fpu->swd);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"TW\">0x%x</reg>\n", (fpu->twd));
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"FCS\">0x%x</reg>\n", (fpu->fcs & 0xffff));
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"OPCODE\">0x%x</reg>\n", fpu->fop);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"FDS\">0x%x</reg>\n", (fpu->fos & 0xffff));
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"FIP\">0x%x</reg>\n", fpu->fip);
+ TRACE_SPRINTF("\t\t\t\t<reg name=\"DATAOP\">0x%x</reg>\n", (fpu->foo));
+ // fxsave stores each 10-byte ST register in a 16-byte slot, i.e. a
+ // stride of 4 u32 elements in st_space; only the low 10 bytes are valid.
+ PrintFPRegister((u8 *)&(fpu->st_space[0]), 0);
+ PrintFPRegister((u8 *)&(fpu->st_space[4]), 1);
+ PrintFPRegister((u8 *)&(fpu->st_space[8]), 2);
+ PrintFPRegister((u8 *)&(fpu->st_space[12]), 3);
+ PrintFPRegister((u8 *)&(fpu->st_space[16]), 4);
+ PrintFPRegister((u8 *)&(fpu->st_space[20]), 5);
+ PrintFPRegister((u8 *)&(fpu->st_space[24]), 6);
+ PrintFPRegister((u8 *)&(fpu->st_space[28]), 7);
+ TRACE_SPRINTF("\t\t\t</fp>\n");
+
+#if 0
+ printk("00 %08x %08x\n", ((u32*)fpu)[0], ((u32*)fpu)[1]);
+ printk("08 %08x %08x\n", ((u32*)fpu)[2], ((u32*)fpu)[3]);
+ printk("10 %08x %08x\n", ((u32*)fpu)[4], ((u32*)fpu)[5]);
+ printk("18 %08x %08x\n", ((u32*)fpu)[6], ((u32*)fpu)[7]);
+ printk("20 %08x %08x\n", ((u32*)fpu)[8], ((u32*)fpu)[9]);
+ printk("28 %08x %08x\n", ((u32*)fpu)[10], ((u32*)fpu)[11]);
+ printk("30 %08x %08x\n", ((u32*)fpu)[12], ((u32*)fpu)[13]);
+ printk("38 %08x %08x\n", ((u32*)fpu)[14], ((u32*)fpu)[15]);
+ printk("40 %08x %08x\n", ((u32*)fpu)[16], ((u32*)fpu)[17]);
+ printk("48 %08x %08x\n", ((u32*)fpu)[18], ((u32*)fpu)[19]);
+ printk("50 %08x %08x\n", ((u32*)fpu)[20], ((u32*)fpu)[21]);
+ printk("58 %08x %08x\n", ((u32*)fpu)[22], ((u32*)fpu)[23]);
+ printk("60 %08x %08x\n", ((u32*)fpu)[24], ((u32*)fpu)[25]);
+ printk("68 %08x %08x\n", ((u32*)fpu)[26], ((u32*)fpu)[27]);
+ printk("70 %08x %08x\n", ((u32*)fpu)[28], ((u32*)fpu)[29]);
+ printk("78 %08x %08x\n", ((u32*)fpu)[30], ((u32*)fpu)[31]);
+ printk("80 %08x %08x\n", ((u32*)fpu)[32], ((u32*)fpu)[33]);
+ printk("88 %08x %08x\n", ((u32*)fpu)[34], ((u32*)fpu)[35]);
+ printk("90 %08x %08x\n", ((u32*)fpu)[36], ((u32*)fpu)[37]);
+ printk("98 %08x %08x\n", ((u32*)fpu)[38], ((u32*)fpu)[39]);
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_MSR
+//
+// DESCRIPTION:
+// Capture all MSR
+//
+// PARAMETERS: None
+//
+// RETURNS: None
+//
+// TODOS:
+//
+static void
+mictc_capture_MSR(void)
+{
+ // u32 me_cpu = PCPU_GET(cpuid);
+#if 0
+ //msr->msrMIC_CR_SPUBASE = tc_rdmsr(MIC_CR_SPUBASE);
+ //msr->msrIA32_CR_MISC = tc_rdmsr(IA32_CR_MISC);
+ //msr->msrWMT_CR_LASTBRANCH_0 = tc_rdmsr(WMT_CR_LASTBRANCH_0);
+ //msr->msrWMT_CR_LASTBRANCH_1 = tc_rdmsr(WMT_CR_LASTBRANCH_1);
+ msr->msrVMX_MSR_BASE = tc_rdmsr(VMX_MSR_BASE);
+ msr->msrVMX_MSR_BASE_PLUS_1 = tc_rdmsr(VMX_MSR_BASE_PLUS_1);
+ msr->msrVMX_MSR_BASE_PLUS_2 = tc_rdmsr(VMX_MSR_BASE_PLUS_2);
+ msr->msrVMX_MSR_BASE_PLUS_3 = tc_rdmsr(VMX_MSR_BASE_PLUS_3);
+ msr->msrVMX_MSR_BASE_PLUS_4 = tc_rdmsr(VMX_MSR_BASE_PLUS_4);
+ msr->msrVMX_MSR_BASE_PLUS_5 = tc_rdmsr(VMX_MSR_BASE_PLUS_5);
+ msr->msrVMX_MSR_BASE_PLUS_6 = tc_rdmsr(VMX_MSR_BASE_PLUS_6);
+ msr->msrVMX_MSR_BASE_PLUS_7 = tc_rdmsr(VMX_MSR_BASE_PLUS_7);
+ msr->msrVMX_MSR_BASE_PLUS_8 = tc_rdmsr(VMX_MSR_BASE_PLUS_8);
+ msr->msrVMX_MSR_BASE_PLUS_9 = tc_rdmsr(VMX_MSR_BASE_PLUS_9);
+ msr->msrTIME = tc_rdmsr(TIME);
+ msr->msrPINFO = tc_rdmsr(PINFO);
+#endif
+ TRACE_SPRINTF("\t\t\t<msr>\n");
+ TRACE_SPRINTF_MSR(P6_CR_TSC);
+ TRACE_SPRINTF_MSR(X86_CR_APICBASE);
+ TRACE_SPRINTF_MSR(CBOX_SPU_PA_MSR);
+ // This is being added since it is included in the ITP dump as well.
+ TRACE_SPRINTF("\t\t\t\t<reg addr=\"0x%x\">0x%llx</reg>\n", SPU_BASE, (tc_rdmsr(CBOX_SPU_PA_MSR) & 0x7fffffffffffffff) + 0x1000);
+ TRACE_SPRINTF_MSR(CBOX_SPU_SAMPLER_BIND_MSR);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask0);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask1);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask2);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask3);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask4);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask5);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask6);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysMask7);
+ TRACE_SPRINTF_MSR(MSR_EFER & ~0x800); // Force bit 11 to 0
+ TRACE_SPRINTF_MSR(MSR_SF_MASK);
+ TRACE_SPRINTF_MSR(MSR_FSBASE);
+ TRACE_SPRINTF_MSR(MSR_GSBASE);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRcap);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRdefType);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase2);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase0);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase1);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase3);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase4);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase5);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase6);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRphysBase7);
+ TRACE_SPRINTF_MSR(STAR);
+ TRACE_SPRINTF_MSR(LSTAR);
+
+ // MSR_KGSBASE needs some special handling.
+ // On Silicon when a thread transitions from Ring 3->Ring 0 the
+ // first instruction it executes is swapgs which swaps the value
+ // of the current GSBase (which could be 0x0) with the value in
+ // MSR_KGSBASE to get to the per cpu data structure and onwards to the kernel stack.
+ // On Silicon, when the same thread transitions from Ring 0->Ring 3 MSR_KGSBASE gets
+ // the right value as a result of another swapgs on the way back.
+ // Where Trace Capture differs from Silicon is that we take a snapshot while executing
+ // in Ring 0 (when MSR_KGSBASE could be 0x0) but the first instruction
+ // which executes on LarrySim is a Ring 3 instruction.
+ // On the first syscall in LarrySim when it executes a swapgs it sees a MSR_KGSBASE value of 0x0.
+ // LarrySim cannot get to the kernel stack and we correctly hit a double fault (Bang!).
+ // The correct fix is to ensure that LarrySim sees a correct value of
+ // MSR_KGSBASE when it is provided a snapshot.
+//FIXME
+// TRACE_SPRINTF("\t\t\t\t<reg addr=\"0x%x\">0x%lx</reg>\n", MSR_KGSBASE, &__pcpu[me_cpu]);
+
+ // The following MSR's are currently ifdef'd out
+ // because LarrySim barfs on these.
+ // We might need these later.
+#if 0
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix64K_00000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix16K_80000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix16K_A0000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_C0000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_C8000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_D0000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_D8000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_E0000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_E8000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_F0000);
+ TRACE_SPRINTF_MSR(X86_CR_MTRRfix4K_F8000);
+ TRACE_SPRINTF_MSR(P5_MC_ADDR);
+ TRACE_SPRINTF_MSR(P5_MC_TYPE);
+ TRACE_SPRINTF_MSR(MSR_TR1);
+ TRACE_SPRINTF_MSR(MSR_TR2);
+ TRACE_SPRINTF_MSR(MSR_TR3);
+ TRACE_SPRINTF_MSR(MSR_TR4);
+ TRACE_SPRINTF_MSR(MSR_TR5);
+ TRACE_SPRINTF_MSR(MSR_TR6);
+ TRACE_SPRINTF_MSR(MSR_TR7);
+ TRACE_SPRINTF_MSR(MSR_TR9);
+ TRACE_SPRINTF_MSR(MSR_TR10);
+ TRACE_SPRINTF_MSR(MSR_TR11);
+ TRACE_SPRINTF_MSR(MSR_TR12);
+ TRACE_SPRINTF_MSR(IA32_APIC_BASE);
+ TRACE_SPRINTF_MSR(IA32_TIME_STAMP_COUNTER);
+ TRACE_SPRINTF_MSR(IA32_PerfCntr0);
+ TRACE_SPRINTF_MSR(IA32_PerfCntr1);
+ TRACE_SPRINTF_MSR(IA32_PerfCntr2);
+ TRACE_SPRINTF_MSR(IA32_PerfCntr3);
+ TRACE_SPRINTF_MSR(PerfFilteredCntr0);
+ TRACE_SPRINTF_MSR(PerfFilteredCntr1);
+ TRACE_SPRINTF_MSR(PerfFilteredCntr2);
+ TRACE_SPRINTF_MSR(PerfFilteredCntr3);
+ TRACE_SPRINTF_MSR(IA32_PerfEvtSel0);
+ TRACE_SPRINTF_MSR(IA32_PerfEvtSel1);
+ TRACE_SPRINTF_MSR(IA32_PerfEvtSel2);
+ TRACE_SPRINTF_MSR(IA32_PerfEvtSel3);
+ TRACE_SPRINTF_MSR(PerfFilterMask);
+ TRACE_SPRINTF_MSR(IA32_PERF_GLOBAL_STATUS);
+ TRACE_SPRINTF_MSR(IA32_PERF_GLOBAL_OVF_CONTROL);
+ TRACE_SPRINTF_MSR(IA32_PERF_GLOBAL_CTRL);
+ TRACE_SPRINTF_MSR(IA32_MCG_CTL);
+ TRACE_SPRINTF_MSR(IA32_MC0_CTRL);
+ TRACE_SPRINTF_MSR(IA32_MC0_STAT);
+ TRACE_SPRINTF_MSR(IA32_MC0_ADDR);
+ TRACE_SPRINTF_MSR(IA32_MC0_MISC);
+ TRACE_SPRINTF_MSR(IA32_MC1_CTRL);
+ TRACE_SPRINTF_MSR(IA32_MC1_STAT);
+ TRACE_SPRINTF_MSR(IA32_MC1_ADDR);
+ TRACE_SPRINTF_MSR(IA32_MC1_MISC);
+ TRACE_SPRINTF_MSR(SYSCALL_FLAG_MASK);
+ TRACE_SPRINTF_MSR(X86_PAT);
+#endif
+ TRACE_SPRINTF("\t\t\t</msr>\n");
+}
+
+
+//u64 rdtsccount = 0, dmasetuptime = 0, dmacomptime=0, hostacktime=0;
+
+#if MIC_TRACE_CAPTURE_MEMORY_TEST
+// Sum the four byte values of a U32 (a simple additive checksum used only
+// by the memory test). The previous comment claimed this "counts the
+// number of bytes", which was misleading -- it accumulates their values.
+static U32 AddBytes(U32 add)
+{
+ U32 sum = 0x0;
+ // unsigned index avoids the signed/unsigned comparison against sizeof()
+ for (unsigned i = 0; i < sizeof(U32); i++)
+ {
+  sum += (add & 0xFF);
+  add = (add >> 8);
+ }
+ return sum;
+}
+#endif
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_capture_memory
+//
+// DESCRIPTION:
+// DMA the entire physical memory image to the host over SCIF in
+// MICTC_MEM_BUFFER_SIZE chunks. After each chunk it signals
+// TRACE_PAGE_READY via scif_fence_signal and spins until the host posts
+// TRACE_HOST_READY. When everything is transferred it posts
+// TRACE_MEM_COMPLETE and waits for the host's final TRACE_COMPLETE.
+// (The old header said "Trace Capture IPI Handler", which described a
+// different function.)
+//
+// PARAMETERS: None
+//
+// RETURNS: 0 on success, 1 on a SCIF failure, -EBUSY on a host timeout
+//
+// TODOS:
+//
+static int
+mictc_capture_memory(void)
+{
+ long err;
+ long i;
+ long delay_count;
+ long total_transfered = 0;
+
+ g_sizeXferred = 0;
+
+ // Transfer a full buffer.
+ // NOTE(review): max_pfn << PAGE_SHIFT is the RAM size in bytes; this
+ // assumes 64-bit long (true on this target) -- the explicit uint64_t
+ // cast below is only applied to the remainder computation.
+ for (i = 0; total_transfered < (max_pfn << PAGE_SHIFT); i++) {
+  printk("before scif_writeto, i = %ld\n", i);
+
+  // Transfer any remainder
+  if ((max_pfn << PAGE_SHIFT) - total_transfered < MICTC_MEM_BUFFER_SIZE) {
+   long remainder = ((uint64_t)max_pfn << PAGE_SHIFT) % MICTC_MEM_BUFFER_SIZE;
+
+   printk("Writing %ld bytes, max_pfn = %ld\n", remainder, max_pfn);
+
+   if ((err = scif_writeto(mictc_endp_data, scif_offset_mem + (i * MICTC_MEM_BUFFER_SIZE),
+      remainder, scif_offset_dst, 0)) < 0) {
+    pr_crit("%s:%s:%d scif_writeto failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, err);
+    return 1;
+   }
+   total_transfered += remainder;
+   g_sizeXferred = remainder;
+  } else {
+   if ((err = scif_writeto(mictc_endp_data, scif_offset_mem + (i * MICTC_MEM_BUFFER_SIZE),
+      MICTC_MEM_BUFFER_SIZE, scif_offset_dst, 0)) < 0) {
+    pr_crit("%s:%s:%d scif_writeto failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, err);
+    return 1;
+   }
+   total_transfered += MICTC_MEM_BUFFER_SIZE;
+   g_sizeXferred = MICTC_MEM_BUFFER_SIZE;
+  }
+  *g_traceBufferSizeOffset = g_sizeXferred;
+  printk("before fence\n");
+  // Publish TRACE_PAGE_READY into the status word once the DMA completes.
+  err = scif_fence_signal(mictc_endp_data, (off_t)scif_offset_xml + TRACE_STATUS_OFFSET,
+     TRACE_PAGE_READY, 0, 0, SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL);
+
+  if (err < 0) {
+   printk("scif_fence_signal failed. err = %ld\n", err);
+   return 1;
+  }
+  printk("TRACE_PAGE_READY %lld bytes\n", g_sizeXferred);
+  g_sizeXferred = 0;
+
+  delay_count = 0;
+  printk("waiting for TRACE_HOST_READY\n");
+
+  // Busy-wait for the host to consume the chunk, bounded by
+  // TRACE_CAPTURE_TIMEOUT iterations.
+  while (*g_traceBufferStatusOffset != TRACE_HOST_READY) {
+   cpu_relax();
+   delay_count++;
+   if (delay_count == TRACE_CAPTURE_TIMEOUT) {
+    printk("Memory Dump Timeout. Host did not update @physAddr 0x%lx\n", i << PAGE_SHIFT);
+    return -EBUSY;
+   }
+  }
+ }
+ *g_traceBufferSizeOffset = 0;
+ *g_traceBufferStatusOffset = TRACE_MEM_COMPLETE;
+
+ delay_count = 0;
+
+ // Wait for the host's final acknowledgement of the whole dump.
+ while (*g_traceBufferStatusOffset != TRACE_COMPLETE) {
+  cpu_relax();
+  delay_count++;
+  if (delay_count == TRACE_CAPTURE_TIMEOUT) {
+   printk("Trace completion timeout.\n");
+   return -EBUSY;
+  }
+ }
+
+ return 0;
+}
+
+
+//------------------------------------------------------------------------------
+// FUNCTION: mictc_trace_capture
+//
+// DESCRIPTION:
+// Perform all the tasks related to Trace Capture
+// for a particular Hardware Thread.
+// The tasks currently include:
+// General purpose registers
+// Segment registers
+// Debug registers
+// Control registers
+// VPU registers
+// MSRs
+//
+// CPUs serialize themselves on the cpus_stopped counter so the XML output
+// is written one CPU at a time, in CPU-id order. CPU 0 writes the header;
+// the highest-numbered online CPU writes the trailer and then waits for
+// the host to request the memory dump.
+//
+// Note: The SPU is not setup in Linux.
+//
+// PARAMETERS: regs - pointer to the task's registers
+//
+// RETURNS: None
+//
+// TODOS:
+//
+static void
+mictc_trace_capture(struct pt_regs *regs)
+{
+ long delay_count;
+
+// printk("Entering mictc_trace_capture on cpu %d, for process = %s\n", smp_processor_id(), current->comm);
+
+ // Logic to let threads in one by one in order
+
+ while (atomic_read(&cpus_stopped) != smp_processor_id()) {
+  cpu_relax();
+//STH touch_nmi_watchdog();
+ }
+
+ if (smp_processor_id() == 0)
+ {
+  // CPU0 is responsible for preparing the
+  // Trace Capture Header.
+  mictc_prep_header();
+ }
+
+ TRACE_SPRINTF("\t\t<cpu num=\"%d\">\n", smp_processor_id());
+ mictc_capture_general_purpose_reg(regs);
+ mictc_capture_segment_reg(&(trace->segment), regs);
+ mictc_capture_debug_reg();
+ mictc_capture_control_reg();
+ mictc_capture_vector_reg(&(trace->vpustate));
+
+//STH touch_nmi_watchdog(); // Just to be safe
+
+ // The SPU is not setup currently in Linux
+ if (always_false) mictc_capture_SPU_reg();
+
+ mictc_capture_FPU_reg(&(trace->fpu));
+ mictc_capture_MSR();
+
+// printk("In mictc_trace_capture on cpu %d, after MSRs\n", smp_processor_id());
+
+ TRACE_SPRINTF("\t\t</cpu>\n");
+
+ // Each core should flush their caches
+ // as the initiator is going to take a memory
+ // dump soon after.
+ // Not required since DMA should snoop the caches.
+ //wbinvd();
+
+// printk("In mictc_trace_capture on cpu %d, before check for last cpu\n", smp_processor_id());
+
+ if (smp_processor_id() == (num_online_cpus() - 1))
+ {
+  // The last CPU is responsible for preparing the
+  // Trace Capture Trailer.
+  TRACE_SPRINTF("\t</cpu_state>\n");
+
+  TRACE_SPRINTF("</arch_data>\n");
+
+  // Update the size as the Host App needs this information.
+  *g_traceBufferSizeOffset = g_sizeXferred;
+
+  g_sizeXferred = 0;
+
+  // Update the status for the Host App. The CPU register state has been written by all
+  // the hardware threads. The host app polls for this status.
+  *g_traceBufferStatusOffset = TRACE_REG_COMPLETE;
+
+  printk("Completed Arch Dump. Now Beginning Memory Dump. Be patient (~1 min is ETA)..\n");
+
+  delay_count = 0;
+
+  // Bounded wait for the host to ask for the memory image.
+  while (*g_traceBufferStatusOffset != TRACE_GET_FILE)
+  {
+   cpu_relax();
+   delay_count++;
+   if (delay_count == TRACE_CAPTURE_TIMEOUT)
+   {
+    printk("Arch Dump Timeout. Host did not update status.\n");
+    break;
+   }
+  }
+  printk("%s out of wait loop.\n", __FUNCTION__);
+ }
+
+// printk("Exiting mictc_trace_capture on cpu %d\n", smp_processor_id());
+}
+
+
+// Starting point for trace_capture.
+//
+// Serializes a whole-machine capture under mictc_lock:
+//  1. allocate the XML staging buffer and register-state scratch area,
+//  2. open/bind/connect a SCIF data channel to the host,
+//  3. register the XML buffer and all of physical memory for RDMA and
+//     exchange registered offsets with the host,
+//  4. check the host-supplied trigger list against g_traceCurrentTrigger,
+//  5. stop all other CPUs via NMI, capture this CPU's state, wait for the
+//     others, DMA the memory image, then release everyone.
+//
+// Error-path fixes relative to the previous revision:
+//  - the first kmalloc failure no longer jumps past spin_unlock()
+//    (it used to leave mictc_lock held forever);
+//  - a scif_open failure no longer does a bare "return" with the lock
+//    held and both allocations leaked.
+static void
+mictc_start_capture(void)
+{
+ long ret;
+ long err;
+ struct scif_portID portID_data;
+ int i;
+ int found_it = 0;
+
+ spin_lock(&mictc_lock);
+ printk("Starting tracecapture on cpu %d. Taking lock.\n", smp_processor_id());
+
+ if (!(g_traceBufferAllocated = kmalloc(MICTC_XML_BUFFER_SIZE, GFP_KERNEL))) {
+  pr_crit("%s:%s:%d kmalloc failed with ENOMEM\n", __FILE__, __FUNCTION__, __LINE__);
+  goto done0a; // was "goto done0", which leaked mictc_lock
+ }
+
+ pr_crit("%s:%s:%d kmalloc returned %llx\n", __FILE__, __FUNCTION__, __LINE__, (uint64_t)g_traceBufferAllocated);
+
+ // Carve the shared-status words out of the XML buffer.
+ g_traceBufferStatusOffset = (u64*)((u64)g_traceBufferAllocated + TRACE_STATUS_OFFSET);
+ g_traceBufferSizeOffset = (u64*)((u64)g_traceBufferAllocated + TRACE_SIZE_OFFSET);
+ g_traceBufferDataOffset = (u32*)((u64)g_traceBufferAllocated + TRACE_DATA_OFFSET);
+ g_traceBufferTriggerOffset = (u32*)((u64)g_traceBufferAllocated + TRACE_TRIGGER_OFFSET);
+
+ *g_traceBufferStatusOffset = TRACE_DATA;
+#if MIC_TRACE_CAPTURE_MEMORY_TEST
+ g_traceBufferChecksumOffset = (u64*)((u64)g_traceBufferAllocated + TRACE_CHECKSUM_OFFSET);
+#endif
+
+ if (!(trace = kmalloc(sizeof(struct mictc_trace), GFP_KERNEL))) {
+  pr_crit("%s:%s:%d kmalloc failed with ENOMEM\n", __FILE__, __FUNCTION__, __LINE__);
+  goto done1a;
+ }
+
+ pr_crit("%s:%s:%d kmalloc returned %llx\n", __FILE__, __FUNCTION__, __LINE__, (uint64_t)trace);
+
+ memset(trace, 0, sizeof(struct mictc_trace));
+
+ pr_crit("g_traceBufferStatusOffset %llx\n", (uint64_t)g_traceBufferStatusOffset);
+ pr_crit("g_traceBufferSizeOffset %llx\n", (uint64_t)g_traceBufferSizeOffset);
+ pr_crit("g_traceBufferDataOffset %llx\n", (uint64_t)g_traceBufferDataOffset);
+
+ // Data channel
+ if (!(mictc_endp_data = scif_open())) {
+  pr_crit("%s:%s:%d scif_open failed with ENOMEM\n", __FILE__, __FUNCTION__, __LINE__);
+  goto done1b; // was "return", which leaked the lock, trace and the XML buffer
+ }
+
+ if ((ret = scif_bind(mictc_endp_data, MICTC_SCIF_PORT_DATA)) < 0) {
+  pr_crit("%s:%s:%d scif_bind failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, ret);
+  goto done1;
+ }
+
+ portID_data.node = 0;
+ portID_data.port = MICTC_SCIF_PORT_DATA;
+
+ if ((ret = scif_connect(mictc_endp_data, &portID_data)) < 0) {
+  pr_crit("%s:%s:%d scif_connect failed with error %ld\n", __FILE__, __FUNCTION__, __LINE__, ret);
+  goto done1;
+ }
+
+ // Register the XML staging buffer for RDMA.
+ if ((ret = (long)scif_register(mictc_endp_data,
+     g_traceBufferAllocated,
+     MICTC_XML_BUFFER_SIZE,
+     0, // suggested_offset,
+     SCIF_PROT_READ | SCIF_PROT_WRITE,
+     SCIF_MAP_KERNEL)) < 0) {
+  if (ret > -300) {
+   pr_crit("%s:%s:%d scif_register failed with %ld\n", __FILE__, __FUNCTION__, __LINE__, ret);
+   goto done2;
+  }
+ }
+ scif_offset_xml = ret;
+ pr_crit("%s:%s:%d scif_register scif_offset_xml = %lx\n", __FILE__, __FUNCTION__, __LINE__, scif_offset_xml);
+
+ // Register all of physical memory.
+ if ((ret = (long)scif_register(mictc_endp_data,
+     __va(0), // Physical page 0
+     max_pfn << PAGE_SHIFT,
+     0, // suggested_offset,
+     SCIF_PROT_READ | SCIF_PROT_WRITE,
+     SCIF_MAP_KERNEL)) < 0) {
+  if (ret > -300) {
+   pr_crit("%s:%s:%d scif_register failed with %ld\n", __FILE__, __FUNCTION__, __LINE__, ret);
+   goto done2;
+  }
+ }
+ scif_offset_mem = ret;
+ pr_crit("%s:%s:%d scif_register scif_offset_mem = %lx\n", __FILE__, __FUNCTION__, __LINE__, scif_offset_mem);
+
+ BARRIER(mictc_endp_data, "before barrier");
+
+ // Exchange registered offsets with the host: receive its destination
+ // window, send it ours.
+ if ((err = scif_recv(mictc_endp_data, &scif_offset_dst, sizeof(scif_offset_dst), SCIF_RECV_BLOCK)) <= 0) {
+  pr_crit("%s:%s:%d scif_recv failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err);
+  goto close;
+ }
+
+ if ((err = scif_send(mictc_endp_data, &scif_offset_xml, sizeof(scif_offset_xml), SCIF_SEND_BLOCK)) <= 0) {
+  pr_crit("%s:%s:%d scif_send failed with err %ld\n", __FILE__, __FUNCTION__, __LINE__, err);
+  goto close;
+ }
+
+ while (*g_traceBufferStatusOffset != TRACE_HOST_READY)
+ {
+  msleep(100);
+  touch_nmi_watchdog();
+ }
+
+ // Get trigger data.
+ for (i = 0; i < TRACE_TRIGGER_MAX; i++) {
+  g_traceTriggers[i] = *g_traceBufferTriggerOffset;
+  printk("Found trace trigger %d\n", g_traceTriggers[i]);
+  g_traceBufferTriggerOffset++;
+
+  if (g_traceTriggers[i] == TRACE_EOL) break;
+ }
+
+ // Is the trigger data empty? If so, accept everything.
+ if (g_traceTriggers[0] == TRACE_EOL) {
+  printk("Trace trigger data is empty.\n");
+  found_it = 1;
+ } else if (g_traceTriggers[0] == TRACE_IGNORE) {
+  printk("Ignoring current trace.");
+ } else {
+  // See if g_traceCurrentTrigger is in the trigger data.
+  // If not, abort this trace.
+  for (i = 0; i < TRACE_TRIGGER_MAX; i++) {
+   if (g_traceTriggers[i] == TRACE_EOL) break;
+
+   if (g_traceTriggers[i] == g_traceCurrentTrigger) {
+    found_it = 1;
+    printk("Matched trace trigger %d\n", g_traceTriggers[i]);
+    break;
+   }
+  }
+ }
+
+ if (!found_it) {
+  // Abort this trace
+  printk("Trace trigger did not match -- aborting.\n");
+  *g_traceBufferStatusOffset = TRACE_ABORTED;
+  goto done3;
+ }
+
+ if (always_false) {
+  // Mmap memory at 0xfee03000 physical.
+  spu_addr = ioremap(0xfee03000, 0x1000);
+  if (! spu_addr) {
+   pr_crit("%s ioremap failed.\n", __FUNCTION__);
+   goto done3;
+  }
+  printk("CPU ioremap %p\n", spu_addr);
+ }
+
+ cli; // Interrupts off
+ atomic_set(&cpus_stopped, 0);
+ atomic_set(&cpus_released, 0);
+ // Send IPI to capture all other cpus.
+ apic->send_IPI_allbutself(NMI_VECTOR);
+ mictc_trace_capture(task_pt_regs(current));
+ atomic_inc(&cpus_stopped);
+
+ pr_debug("start_capture: Entering wait loop until lock count %d >= %d on cpu %d\n", atomic_read(&cpus_stopped), num_online_cpus() - 1, smp_processor_id());
+
+ { int ctr = 0;
+  // Wait for every other CPU to finish its trace capture tasks.
+  while (atomic_read(&cpus_stopped) < num_online_cpus()) {
+   cpu_relax();
+//STH touch_nmi_watchdog();
+   if (ctr++ > 1000000) {
+    ctr = 0;
+    printk("%s:%d *** waiting loop cpus_stopped = %d\n", __FUNCTION__, __LINE__, atomic_read(&cpus_stopped));
+   }
+  }
+ }
+
+ printk("%s out of wait loop.\n", __FUNCTION__);
+
+ // Get a memory dump here before exiting.
+ err = mictc_capture_memory();
+
+ printk("Completed Memory Dump.\n");
+
+ // Now release all cores.
+ atomic_set(&cpus_stopped, num_online_cpus() + 1);
+
+ // Wait for every other CPU to be released
+ while (atomic_read(&cpus_released) < num_online_cpus() - 1) {
+  cpu_relax();
+  touch_nmi_watchdog();
+ }
+ sti; // Interrupts on
+
+ // Cleanup. Labels unwind in reverse order of acquisition; every failure
+ // path above jumps to the first label that frees what it had acquired.
+ close:
+ if (always_false) {
+  iounmap(spu_addr);
+ }
+ done3:
+// scif_unregister(mictc_endp_data, scif_offset, MICTC_XML_BUFFER_SIZE);
+ done2:
+ done1:
+ scif_close(mictc_endp_data);
+ done1b:
+ kfree(trace);
+ trace = NULL;
+ done1a:
+ kfree(g_traceBufferAllocated);
+ g_traceBufferAllocated = NULL;
+ done0a:
+ spin_unlock(&mictc_lock);
+ printk("Ending tracecapture on cpu %d. Releasing lock.\n", smp_processor_id());
+}
+EXPORT_SYMBOL(mictc_start_capture);
+
+
+/*
+ * mictc_handle_exception() - main entry point from a kernel exception
+ *
+ * Called (via the die-notifier) on every CPU that receives the capture
+ * NMI broadcast by mictc_start_capture. Runs with interrupts off.
+ * Performs this CPU's register capture, then spins until the initiating
+ * CPU finishes the memory dump (cpus_stopped reaching num_online_cpus()+1
+ * is the release signal), and finally acknowledges via cpus_released.
+ * Returns 1 so the notifier reports the NMI as handled.
+ *
+ * Locking hierarchy:
+ * interface locks, if any (begin_session)
+ */
+int
+mictc_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+ // Interrupts are off.
+
+ // printk("Entering mictc_handle_exception on cpu %d pid: %d, name: %s\n", smp_processor_id(), current->pid, current->comm);
+
+ mictc_trace_capture(regs);
+ atomic_inc(&cpus_stopped);
+ pr_debug("handler: Entering wait loop until lock count %d >= %d on cpu %d\n", atomic_read(&cpus_stopped), num_online_cpus() - 1, smp_processor_id());
+ // Wait for every other CPU to finish its Trace Capture Tasks.
+ // This test is for num_online_cpus+1 to hold all threads that are
+ // in interrupt context so that the main thread can dump memory.
+ while (atomic_read(&cpus_stopped) < num_online_cpus() + 1) {
+  cpu_relax();
+//STH touch_nmi_watchdog();
+ }
+
+ atomic_inc(&cpus_released);
+
+ printk("Exiting mictc_handle_exception on cpu %d %s\n", smp_processor_id(), current->comm);
+ return 1;
+}
+
+
+// Die-notifier body (interrupts already disabled by mictc_notify).
+// Only DIE_NMI is handled: the NMI is the capture IPI broadcast by
+// mictc_start_capture, so this CPU is pulled into mictc_handle_exception.
+// Any other event is passed along with NOTIFY_DONE. The large #if 0
+// region is retained KGDB-style dispatch that is currently unused.
+static int __mictc_notify(struct die_args *args, unsigned long cmd)
+{
+ struct pt_regs *regs = args->regs;
+#if 0
+ switch (cmd) {
+ case DIE_NMI:
+  if (atomic_read(&kgdb_active) != -1) {
+   /* KGDB CPU roundup */
+   kgdb_nmicallback(smp_processor_id(), regs);
+   was_in_debug_nmi[smp_processor_id()] = 1;
+   touch_nmi_watchdog();
+   return NOTIFY_STOP;
+  }
+  return NOTIFY_DONE;
+
+ case DIE_NMIUNKNOWN:
+  if (was_in_debug_nmi[smp_processor_id()]) {
+   was_in_debug_nmi[smp_processor_id()] = 0;
+   return NOTIFY_STOP;
+  }
+  return NOTIFY_DONE;
+
+ case DIE_DEBUG:
+  if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+   if (user_mode(regs))
+    return single_step_cont(regs, args);
+   break;
+  } else if (test_thread_flag(TIF_SINGLESTEP))
+   /* This means a user thread is single stepping
+    * a system call which should be ignored
+    */
+   return NOTIFY_DONE;
+  /* fall through */
+ default:
+  if (user_mode(regs))
+   return NOTIFY_DONE;
+ }
+#endif
+ if (cmd == DIE_NMI) {
+  if (mictc_handle_exception(args->trapnr, args->signr, cmd, regs)) {
+   touch_nmi_watchdog();
+   return NOTIFY_STOP;
+  }
+ } else {
+  touch_nmi_watchdog();
+  return NOTIFY_DONE;
+ }
+
+ /* Must touch watchdog before return to normal operation */
+ touch_nmi_watchdog();
+ return NOTIFY_STOP;
+}
+
+
+/*
+ * Notifier-chain entry point: run __mictc_notify() with local interrupts
+ * disabled and hand its NOTIFY_* verdict back to the chain.
+ */
+static int
+mictc_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+ unsigned long irq_state;
+ int verdict;
+
+ local_irq_save(irq_state);
+ verdict = __mictc_notify(ptr, cmd);
+ local_irq_restore(irq_state);
+ return verdict;
+}
+
+
+/*
+ * ioctl entry point for our character device.
+ *
+ * MICTC_START_CAPTURE is the only supported request: ioctl_param carries
+ * the trace trigger number, which is recorded for matching against the
+ * host-supplied trigger list before the capture is kicked off.
+ * Any other request number fails with -ENXIO.
+ */
+long device_ioctl(
+ struct file *file, /* ditto */
+ unsigned int ioctl_num, /* number and param for ioctl */
+ unsigned long ioctl_param)
+{
+ if (ioctl_num != MICTC_START_CAPTURE) {
+  printk("Invalid ioctl.\n");
+  return -ENXIO;
+ }
+
+ // Save the trigger number to check against the g_traceTrigger array.
+ g_traceCurrentTrigger = (u32)ioctl_param;
+ printk("IOCTL trace trigger %ld\n", ioctl_param);
+ mictc_start_capture();
+ return 0;
+}
+
+
+/*
+ * This is called whenever a process attempts to open the device file
+ */
+static int device_open(struct inode *inode, struct file *file)
+{
+#ifdef DEBUG
+	printk(KERN_INFO "device_open(%p)\n", file);
+#endif
+
+	/*
+	 * We don't want to talk to two processes at the same time.
+	 * NOTE(review): this check-then-increment of Device_Open is racy
+	 * on SMP; an atomic_t (or a lock) would close the window. Kept
+	 * as-is here since Device_Open is declared elsewhere.
+	 */
+	if (Device_Open)
+		return -EBUSY;
+
+	Device_Open++;
+	/* Fix: the original ignored try_module_get() failure, which would
+	 * allow the module to be unloaded while the device is open. */
+	if (!try_module_get(THIS_MODULE)) {
+		Device_Open--;
+		return -EBUSY;
+	}
+	return 0;
+}
+
+/*
+ * Device file close: release the single-open guard taken in
+ * device_open() and drop the module reference.
+ */
+static int device_release(struct inode *inode, struct file *file)
+{
+#ifdef DEBUG
+	printk(KERN_INFO "device_release(%p,%p)\n", inode, file);
+#endif
+
+	/* Ready for the next caller. */
+	Device_Open--;
+	module_put(THIS_MODULE);
+
+	return 0;
+}
+
+
+/*
+ * This structure will hold the functions to be called
+ * when a process does something to the device we
+ * created. Since a pointer to this structure is kept in
+ * the devices table, it can't be local to
+ * init_module. NULL is for unimplemented functions.
+ */
+struct file_operations Fops = {
+	/* Fix: set .owner so the VFS pins this module for the duration of
+	 * any fops call (complements the manual try_module_get/module_put
+	 * done in device_open/device_release). */
+	.owner = THIS_MODULE,
+	// .read = device_read,
+	// .write = device_write,
+	.unlocked_ioctl = device_ioctl,
+	.open = device_open,
+	.release = device_release,	/* a.k.a. close */
+};
+
+/* Die-notifier registration for mictc_notify(); maximum priority so the
+ * trace-capture handler runs before any other die-chain client. */
+static struct notifier_block mictc_notifier = {
+ .notifier_call = mictc_notify,
+ .priority = 0x7fffffff /* we need to be notified first */
+};
+
+
+/*
+ * mictc_init - Register our notifier
+ *
+ */
+static
+int mictc_init(void)
+{
+	int ret_val;
+
+	/* Register the character device (at least try). */
+	ret_val = register_chrdev(MICTC_MAJOR_NUM, MICTC_DEVICE_NAME, &Fops);
+
+	/* Negative values signify an error. */
+	if (ret_val < 0) {
+		printk(KERN_ALERT "%s failed with %d\n",
+		       "Sorry, registering the character device ", ret_val);
+		return ret_val;
+	}
+
+	printk(KERN_INFO "%s The major device number is %d.\n",
+	       "Registration is a success", MICTC_MAJOR_NUM);
+	printk(KERN_INFO "To use trace capture you'll have to create a device file:\n");
+	printk(KERN_INFO "mknod %s c %d 0\n", MICTC_FILE_NAME, MICTC_MAJOR_NUM);
+
+	ret_val = register_die_notifier(&mictc_notifier);
+	/* Fix: the original returned the notifier error without undoing
+	 * the chrdev registration, leaking the major number on failure. */
+	if (ret_val)
+		unregister_chrdev(MICTC_MAJOR_NUM, MICTC_DEVICE_NAME);
+	return ret_val;
+}
+
+
+/*
+ * Module teardown.
+ *
+ * Fix: the original body was empty, which left the die notifier and the
+ * character device registered after module unload; the next trap would
+ * then call into freed module text. Undo both registrations here.
+ */
+static
+void mictc_exit(void)
+{
+	unregister_die_notifier(&mictc_notifier);
+	unregister_chrdev(MICTC_MAJOR_NUM, MICTC_DEVICE_NAME);
+}
+
+module_init(mictc_init);
+module_exit(mictc_exit);
+
+/* NOTE(review): __DATE__ makes builds non-reproducible and trips
+ * -Werror=date-time on newer toolchains — consider removing it. */
+MODULE_AUTHOR("Intel Corp. 2011 (sth " __DATE__ ") ver " TC_VER);
+MODULE_DESCRIPTION("Trace Capture module for K1OM");
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+/*
+ * Trace Capture module common declarations
+ *
+ * Contains configuration, constants and function prototypes
+ * for the Trace Capture module.
+ */
+
+#ifndef _MICTC_H_
+#define _MICTC_H_ 1
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+//#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/kdebug.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h> // for get_user and put_user
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <asm/apicdef.h>
+#include <asm/system.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+#include <asm/irq_regs.h>
+#include <asm/svm.h>
+#include <asm/desc.h>
+#include <linux/ioctl.h>
+
+#ifndef __SCIF_H__
+#include <scif.h>
+#endif
+
+/*
+ * Version info: M.NP
+ */
+
+#define TC_MAJOR "0"
+#define TC_MINOR "1"
+#define TC_PATCH "a"
+#define TC_VER TC_MAJOR "." TC_MINOR TC_PATCH
+
+// These are common to the Host App
+// and the MIC driver Trace Capture Feature
+// COMMON DEFINES START HERE
+// Protocol commands/states exchanged with the host trace-capture app.
+// Values start at TRACE_NOP = 100 and the remaining members follow
+// consecutively (TRACE_DATA = 101 ... TRACE_ABORTED = 111). Both sides
+// must agree on these values — change them in lockstep with the host.
+enum TRACE_COMMAND
+{
+ TRACE_NOP = 100,
+ TRACE_DATA,
+ TRACE_HOST_READY,
+ TRACE_DONE,
+ TRACE_ERROR,
+ TRACE_PRINT,
+ TRACE_GET_FILE,
+ TRACE_PAGE_READY,
+ TRACE_REG_COMPLETE,
+ TRACE_MEM_COMPLETE,
+ TRACE_COMPLETE,
+ TRACE_ABORTED
+};
+
+// IOCTL
+#define MICTC_MAJOR_NUM 's'
+#define MICTC_DEVICE_NAME "trace_capture"
+#define MICTC_FILE_NAME "/dev/trace_capture"
+
+#define MICTC_START_CAPTURE _IOW(MICTC_MAJOR_NUM, 0xff, int)
+
+// Use 2MB for KNF and 4MB for K1OM (auto-detected).
+#define MICTC_XML_BUFFER_SIZE (2 * 1024UL * 1024UL)
+
+#define MICTC_MEM_BUFFER_SIZE (1 * 1024UL * 1024UL * 1024UL)
+
+// Shared memory constants
+#define TRACE_STATUS_OFFSET 8
+#define TRACE_SIZE_OFFSET 16
+
+// Enable/Disable Memory Test.
+// This MUST be enabled simultaneously on Host App as well.
+#define MIC_TRACE_CAPTURE_MEMORY_TEST 0
+
+#if MIC_TRACE_CAPTURE_MEMORY_TEST
+#define TRACE_CHECKSUM_OFFSET 24
+#endif
+
+#define TRACE_TRIGGER_MAX 10
+#define TRACE_TRIGGER_OFFSET 28
+#define TRACE_DATA_OFFSET 4096
+
+// Used to indicate the end of the list for trace triggers.
+#define TRACE_EOL 0xffffffff
+// Used for trace counts to indicate that the driver should ignore current trace.
+// Only meaningful when it is first in the list of trace triggers -- the entries
+// after it are ignored. Trace counts supersede trace triggers.
+#define TRACE_IGNORE 0xfffffffe
+
+// Types of Triggers - Refer to uOS Trace Capture Wiki for Usage
+// Generic counter
+#define TRACE_HOST_GENERIC_COUNTER 0x1
+// Async Flip counter
+#define TRACE_HOST_FRAME_COUNTER 0x2
+// COMMON DEFINES END HERE
+
+// MSR's defined in the trace file sent during REQs
+// Are these all valid for L1OM??
+#define P6_CR_TSC 0x10
+#define X86_CR_APICBASE 0x1b
+#define MIC_CR_SPUBASE 0x1c
+#define IA32_CR_MISC 0x1a0
+#define WMT_CR_LASTBRANCH_0 0x1db
+#define WMT_CR_LASTBRANCH_1 0x1dc
+#define X86_CR_MTRRphysMask0 0x201
+#define X86_CR_MTRRphysMask1 0x203
+#define X86_CR_MTRRphysMask2 0x205
+#define X86_CR_MTRRphysMask3 0x207
+#define X86_CR_MTRRphysMask4 0x209
+#define X86_CR_MTRRphysMask5 0x20b
+#define X86_CR_MTRRphysMask6 0x20d
+#define X86_CR_MTRRphysMask7 0x20f
+#define IA32_CR_PAT 0x277
+#define IA32_MTRR_DEF_TYPE 0x2ff
+#define VMX_MSR_BASE 0x480
+#define VMX_MSR_BASE_PLUS_1 0x481
+#define VMX_MSR_BASE_PLUS_2 0x482
+#define VMX_MSR_BASE_PLUS_3 0x483
+#define VMX_MSR_BASE_PLUS_4 0x484
+#define VMX_MSR_BASE_PLUS_5 0x485
+#define VMX_MSR_BASE_PLUS_6 0x486
+#define VMX_MSR_BASE_PLUS_7 0x487
+#define VMX_MSR_BASE_PLUS_8 0x488
+#define VMX_MSR_BASE_PLUS_9 0x489
+#define TIME 0x4711
+#define PINFO 0x4712
+#define X86_CR_MTRRdefType 0x2ff
+#define X86_CR_MTRRcap 0xfe
+#define X86_CR_MTRRphysBase0 0x200
+#define X86_CR_MTRRphysBase1 0x202
+#define X86_CR_MTRRphysBase2 0x204
+#define X86_CR_MTRRphysBase3 0x206
+#define X86_CR_MTRRphysBase4 0x208
+#define X86_CR_MTRRphysBase5 0x20a
+#define X86_CR_MTRRphysBase6 0x20c
+#define X86_CR_MTRRphysBase7 0x20e
+#define X86_CR_MTRRfix64K_00000 0x250
+#define X86_CR_MTRRfix16K_80000 0x258
+#define X86_CR_MTRRfix16K_A0000 0x259
+#define X86_CR_MTRRfix4K_C0000 0x268
+#define X86_CR_MTRRfix4K_C8000 0x269
+#define X86_CR_MTRRfix4K_D0000 0x26a
+#define X86_CR_MTRRfix4K_D8000 0x26b
+#define X86_CR_MTRRfix4K_E0000 0x26c
+#define X86_CR_MTRRfix4K_E8000 0x26d
+#define X86_CR_MTRRfix4K_F0000 0x26e
+#define X86_CR_MTRRfix4K_F8000 0x26f
+#define P5_MC_ADDR 0x0
+#define P5_MC_TYPE 0x1
+#define MSR_TR1 0x2
+#define MSR_TR2 0x4
+#define MSR_TR3 0x5
+#define MSR_TR4 0x6
+#define MSR_TR5 0x7
+#define MSR_TR6 0x8
+#define MSR_TR7 0x9
+#define MSR_TR9 0xb
+#define MSR_TR10 0xc
+#define MSR_TR11 0xd
+#define MSR_TR12 0xe
+#define IA32_APIC_BASE 0x1b
+#define IA32_TIME_STAMP_COUNTER 0x10
+#define IA32_PerfCntr0 0x20
+#define IA32_PerfCntr1 0x21
+#define IA32_PerfCntr2 0x22
+#define IA32_PerfCntr3 0x23
+#define PerfFilteredCntr0 0x24
+#define PerfFilteredCntr1 0x25
+#define PerfFilteredCntr2 0x26
+#define PerfFilteredCntr3 0x27
+#define IA32_PerfEvtSel0 0x28
+#define IA32_PerfEvtSel1 0x29
+#define IA32_PerfEvtSel2 0x2a
+#define IA32_PerfEvtSel3 0x2b
+#define PerfFilterMask 0x2c
+#define IA32_PERF_GLOBAL_STATUS 0x2d
+#define IA32_PERF_GLOBAL_OVF_CONTROL 0x2e
+#define IA32_PERF_GLOBAL_CTRL 0x2f
+#define IA32_MCG_CTL 0x17b
+#define IA32_MC0_CTRL 0x400
+#define IA32_MC0_STAT 0x401
+#define IA32_MC0_ADDR 0x402
+#define IA32_MC0_MISC 0x403
+#define IA32_MC1_CTRL 0x404
+#define IA32_MC1_STAT 0x405
+#define IA32_MC1_ADDR 0x406
+#define IA32_MC1_MISC 0x407
+#define STAR 0xc0000081
+#define LSTAR 0xc0000082
+#define SYSCALL_FLAG_MASK 0xc0000084
+#define X86_PAT 0x277
+#define SPU_BASE 0x1C
+
+
+#endif /* Recursion block */
--- /dev/null
+# do not edit this file, it will be overwritten on update
+# initramfs:default
+
+# MIC SCIF
+KERNEL=="scif", ACTION=="add", NAME="mic/%k", MODE="0666", RUN+="/bin/chmod og+x /dev/mic"
+KERNEL=="ctrl", ACTION=="add", NAME="mic/%k", MODE="0666"
+
+# Bring up network interfaces manually on rhel7 after module reload
+KERNEL=="mic*", SUBSYSTEM=="net", RUN+="/bin/sh -c '/bin/grep 7. /etc/redhat-release && /sbin/ifup %k'"
--- /dev/null
+michvc-objs := hvc_mic.o
+
+obj-m := michvc.o
--- /dev/null
+/*
+ * hvc_console.h
+ * Copyright (C) 2005 IBM Corporation
+ *
+ * Author(s):
+ * Ryan S. Arnold <rsa@us.ibm.com>
+ *
+ * hvc_console header information:
+ * moved here from arch/powerpc/include/asm/hvconsole.h
+ * and drivers/char/hvc_console.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef HVC_CONSOLE_H
+#define HVC_CONSOLE_H
+#include <linux/kref.h>
+#include <linux/tty.h>
+#include <linux/spinlock.h>
+
+/*
+ * This is the max number of console adapters that can/will be found as
+ * console devices on first stage console init. Any number beyond this range
+ * can't be used as a console device but is still a valid tty device.
+ */
+#define MAX_NR_HVC_CONSOLES 16
+
+/*
+ * The Linux TTY code does not support dynamic addition of tty derived devices
+ * so we need to know how many tty devices we might need when space is allocated
+ * for the tty device. Since this driver supports hotplug of vty adapters we
+ * need to make sure we have enough allocated.
+ */
+#define HVC_ALLOC_TTY_ADAPTERS 8
+
+/* Per-port state for one hvc console instance (one per vterm). */
+struct hvc_struct {
+ spinlock_t lock; /* protects the fields below */
+ int index;
+ struct tty_struct *tty;
+ int count;
+ int do_wakeup;
+ char *outbuf; /* pending output; outbuf_size allocated, n_outbuf used */
+ int outbuf_size;
+ int n_outbuf;
+ uint32_t vtermno;
+ const struct hv_ops *ops; /* backend callbacks (get/put chars, notifiers) */
+ int irq_requested; /* non-zero while the backend holds its irq */
+ int data;
+ struct winsize ws;
+ struct work_struct tty_resize;
+ struct list_head next;
+ struct kref kref; /* ref count & hvc_struct lifetime */
+};
+
+/* implemented by a low level driver */
+/* Backend contract implemented by a low level driver.
+ * get_chars/put_chars return the number of bytes transferred. */
+struct hv_ops {
+ int (*get_chars)(uint32_t vtermno, char *buf, int count);
+ int (*put_chars)(uint32_t vtermno, const char *buf, int count);
+
+ /* Callbacks for notification. Called in open, close and hangup */
+ int (*notifier_add)(struct hvc_struct *hp, int irq);
+ void (*notifier_del)(struct hvc_struct *hp, int irq);
+ void (*notifier_hangup)(struct hvc_struct *hp, int irq);
+};
+
+/* Register a vterm and a slot index for use as a console (console_init) */
+extern int hvc_instantiate(uint32_t vtermno, int index,
+ const struct hv_ops *ops);
+
+/* register a vterm for hvc tty operation (module_init or hotplug add) */
+extern struct hvc_struct * hvc_alloc(uint32_t vtermno, int data,
+ const struct hv_ops *ops, int outbuf_size);
+/* remove a vterm from hvc tty operation (module_exit or hotplug remove) */
+extern int hvc_remove(struct hvc_struct *hp);
+
+/* data available */
+int hvc_poll(struct hvc_struct *hp);
+void hvc_kick(void);
+
+/* Resize hvc tty terminal window */
+extern void __hvc_resize(struct hvc_struct *hp, struct winsize ws);
+
+/* Apply a new terminal window size while holding the port lock. */
+static inline void hvc_resize(struct hvc_struct *hp, struct winsize ws)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&hp->lock, irqflags);
+	__hvc_resize(hp, ws);
+	spin_unlock_irqrestore(&hp->lock, irqflags);
+}
+
+/* default notifier for irq based notification */
+extern int notifier_add_irq(struct hvc_struct *hp, int data);
+extern void notifier_del_irq(struct hvc_struct *hp, int data);
+extern void notifier_hangup_irq(struct hvc_struct *hp, int data);
+
+
+#if defined(CONFIG_XMON) && defined(CONFIG_SMP)
+#include <asm/xmon.h>
+#else
+/* Stub: without CONFIG_XMON && CONFIG_SMP no CPU can be in xmon. */
+static inline int cpus_are_in_xmon(void)
+{
+	return 0;
+}
+#endif
+
+#endif // HVC_CONSOLE_H
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include "hvc_console.h"
+#include <mic/micscif_rb.h>
+#include <mic/micvcons.h>
+#include <asm/io.h>
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/irqflags.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <asm/bug.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/debugfs.h>
+#include <linux/interrupt.h>
+#include <asm/mic/mic_common.h>
+
+#define MIC_COOKIE 0xc0c0
+#define MIC_KNC 1
+
+/* Module parameter: MIC-side address of the shared vcons header. */
+static long vcons_hdr_addr;
+/* Ring buffers: MIC -> host output and host -> MIC input. */
+static struct micscif_rb mic_out_buf;
+static struct micscif_rb mic_in_buf;
+
+/* Mapped/allocated console resources, filled in by hvc_mic_init(). */
+struct vcons_info {
+ struct vcons_buf *hdr; /* ioremapped host-published header */
+ struct vcons_mic_header *mic_hdr; /* MIC-side header */
+ char *vcons_op_buf; /* ioremapped output buffer */
+ char *vcons_ip_buf; /* input buffer page */
+};
+
+static struct vcons_info vcons_info;
+static int dbg = 0; /* module parameter: non-zero dumps the vcons header */
+
+/* Receive data from the host: drain up to @count bytes from the MIC
+ * input ring into @buf. Returns the number of bytes actually read. */
+static int hvc_mic_get_chars(uint32_t vt, char *buf, int count)
+{
+	int avail, want, got;
+
+	avail = micscif_rb_count(&mic_in_buf, count);
+	want = min(avail, count);
+	got = micscif_rb_get_next(&mic_in_buf, buf, want);
+	/* Publish the new read pointer only after a complete read. */
+	if (got == want)
+		micscif_rb_update_read_ptr(&mic_in_buf);
+
+	return got;
+}
+
+/* Send data to the host (mic o/p buffer).
+ * Writes as many of @count bytes as currently fit in the output ring
+ * and returns that number. If nothing fits AND the host side is not
+ * open, it claims all @count bytes were written so the output is
+ * discarded instead of stalling the tty forever. */
+static int hvc_mic_put_chars(uint32_t vt, const char *buf, int count)
+{
+ int ret;
+ int put_count;
+ volatile int *host_status =
+ (volatile int *)&vcons_info.mic_hdr->host_status;
+
+ put_count = min(micscif_rb_space(&mic_out_buf), count);
+ if (put_count) {
+ ret = micscif_rb_write(&mic_out_buf, (void *)buf, put_count);
+ BUG_ON(ret);
+ micscif_rb_commit(&mic_out_buf);
+ } else if (*host_status != MIC_VCONS_HOST_OPEN)
+ return count; /* host closed: drop the data silently */
+ return put_count;
+}
+
+
+/* SBOX console interrupt: poll the hvc port and, if there was any work,
+ * kick the hvc thread to service it. */
+static irqreturn_t hvc_mic_handle_interrupt(int irq, void *dev_id)
+{
+	struct hvc_struct *hp = dev_id;
+
+	if (!hvc_poll(hp))
+		return IRQ_NONE;
+
+	hvc_kick();
+	return IRQ_HANDLED;
+}
+
+/* hvc open notifier: request the SBOX console interrupt.
+ * Returns 0 on success or the request_irq() error code. */
+static int hvc_mic_notifier_add_irq(struct hvc_struct *hp, int irq)
+{
+	int ret = request_irq(get_sbox_irq(HVC_SBOX_INT_IDX),
+			      hvc_mic_handle_interrupt, IRQF_DISABLED,
+			      "hvc intr", hp);
+	if (ret) {
+		/* Fix: printk lacked a log level; this is an error path. */
+		printk(KERN_ERR "Unable to register interrupt\n");
+		return ret;
+	}
+	hp->irq_requested = 1;
+	return 0;
+}
+
+/* hvc close notifier: release the SBOX console interrupt if we hold it. */
+static void hvc_mic_notifier_del_irq(struct hvc_struct *hp, int irq)
+{
+	if (!hp->irq_requested)
+		return;
+
+	free_irq(get_sbox_irq(HVC_SBOX_INT_IDX), hp);
+	hp->irq_requested = 0;
+}
+
+/* hvc hangup notifier: same teardown as the close notifier. */
+static void hvc_mic_notifier_hangup_irq(struct hvc_struct *hp, int irq)
+{
+ hvc_mic_notifier_del_irq(hp, irq);
+}
+
+/* hv_ops backend implementation wiring the hvc core to the MIC rings. */
+static const struct hv_ops hvc_mic_ops = {
+ .get_chars = hvc_mic_get_chars,
+ .put_chars = hvc_mic_put_chars,
+ .notifier_add = hvc_mic_notifier_add_irq,
+ .notifier_del = hvc_mic_notifier_del_irq,
+ .notifier_hangup = hvc_mic_notifier_hangup_irq,
+};
+
+/* Debug helper: log every field of the shared vcons header via MMIO
+ * accessors. NOTE(review): %lx with readq() assumes 64-bit long —
+ * confirm for this target. */
+static void dump_vcons_hdr(struct vcons_buf *hdr)
+{
+ printk(KERN_ERR "host_magic\t%x\n", readl(&hdr->host_magic));
+ printk(KERN_ERR "mic_magic\t%x\n", readl(&hdr->mic_magic));
+ printk(KERN_ERR "o_buf_dma_addr\t%x\n", readl(&hdr->o_buf_dma_addr));
+ printk(KERN_ERR "o_wr\t%x\n", readl(&hdr->o_wr));
+ printk(KERN_ERR "o_size\t%x\n", readl(&hdr->o_size));
+ printk(KERN_ERR "i_hdr_addr\t%lx\n", readq(&hdr->i_hdr_addr));
+ printk(KERN_ERR "i_buf_addr\t%lx\n", readq(&hdr->i_buf_addr));
+ printk(KERN_ERR "i_rd\t%x\n", readl(&hdr->i_rd));
+}
+
+/* Register the MIC vterm as a candidate first-stage console device. */
+static int mic_cons_init(void)
+{
+	int rc;
+
+	rc = hvc_instantiate(MIC_COOKIE, 0, &hvc_mic_ops);
+	if (rc)
+		printk(KERN_ERR "error instantiating hvc console\n");
+
+	return rc;
+}
+
+static struct hvc_struct *hp;
+/*
+ * Module init: map the host-published vcons header, verify the host is
+ * ready and the ring-buffer versions match, wire up the input/output
+ * ring buffers, and register the hvc console. Only after everything
+ * succeeds is MIC_VCONS_READY written to mic_magic to signal the host.
+ */
+static int __init hvc_mic_init(void)
+{
+ struct vcons_buf *hdr = NULL;
+ struct vcons_buf tmp_hdr;
+ int err = 0;
+ char *hvc_buf;
+ u8 card_type=0;
+ uint16_t host_rb_ver, mic_rb_ver;
+
+#if defined(CONFIG_MK1OM)
+ card_type = MIC_KNC;
+#endif
+ hvc_buf = (char *)get_zeroed_page(GFP_KERNEL);
+ if (!hvc_buf) {
+ printk(KERN_ERR "unable to allocate vcons buffer\n");
+ return -ENOMEM;
+ }
+ /* KNC: whole page is input buffer, header allocated separately.
+ * Otherwise: first half of the page is the header, second half input. */
+ if (card_type == MIC_KNC) {
+ vcons_info.vcons_ip_buf = hvc_buf;
+ vcons_info.mic_hdr = (struct vcons_mic_header *)kzalloc(sizeof(struct vcons_mic_header), GFP_KERNEL);
+ if (!vcons_info.mic_hdr) {
+ free_page((unsigned long)hvc_buf);
+ printk(KERN_ERR "unable to allocate vcons header\n");
+ return -ENOMEM;
+ }
+ } else {
+ vcons_info.vcons_ip_buf = hvc_buf + PAGE_SIZE/2;
+ vcons_info.mic_hdr = (struct vcons_mic_header *)hvc_buf;
+ }
+
+ vcons_info.hdr = hdr = ioremap_nocache(vcons_hdr_addr,
+ sizeof(struct vcons_buf));
+ if (!hdr) {
+ printk(KERN_ERR "unable to map vcons header\n");
+ err = -ENOMEM;
+ goto error;
+ }
+
+ if (dbg)
+ dump_vcons_hdr(hdr);
+
+ if (readl(&hdr->host_magic) != MIC_HOST_VCONS_READY) {
+ printk(KERN_ERR "host not ready, giving up\n");
+ err = -ENODEV;
+ goto error;
+ }
+
+ /* Both sides must agree on the ring-buffer layout version. */
+ host_rb_ver = readw(&hdr->host_rb_ver);
+ mic_rb_ver = micscif_rb_get_version();
+ writew(mic_rb_ver, &hdr->mic_rb_ver);
+ if (host_rb_ver != mic_rb_ver) {
+ printk(KERN_ERR "Card and host ring buffer versions mismatch.");
+ printk(KERN_ERR "Card ver: %d, Host ver: %d \n", mic_rb_ver,
+ host_rb_ver);
+ writel(MIC_VCONS_RB_VER_ERR, &hdr->mic_magic);
+ err = -ENXIO;
+ goto error;
+ }
+ /* Snapshot the MMIO header so later fields are read coherently. */
+ memcpy_fromio(&tmp_hdr, hdr, sizeof(struct vcons_buf));
+
+ if (!(vcons_info.vcons_op_buf = ioremap_nocache(tmp_hdr.o_buf_dma_addr,
+ tmp_hdr.o_size))) {
+ printk(KERN_ERR "unable to map vcons output buffer\n");
+ err = -ENOMEM;
+ goto error;
+ }
+
+ tmp_hdr.i_hdr_addr = virt_to_phys(vcons_info.mic_hdr);
+ tmp_hdr.i_buf_addr = virt_to_phys(vcons_info.vcons_ip_buf);
+
+ if (card_type == MIC_KNC)
+ tmp_hdr.i_size = PAGE_SIZE;
+ else
+ tmp_hdr.i_size = PAGE_SIZE/2;
+
+ micscif_rb_init(&mic_out_buf, (volatile uint32_t *)&vcons_info.mic_hdr->o_rd,
+ (volatile uint32_t *)&hdr->o_wr,
+ (volatile uint32_t *)vcons_info.vcons_op_buf,
+ tmp_hdr.o_size);
+
+ micscif_rb_init(&mic_in_buf,
+ (volatile uint32_t *)&hdr->i_rd,
+ (volatile uint32_t *)&vcons_info.mic_hdr->i_wr,
+ (volatile uint32_t *)vcons_info.vcons_ip_buf,
+ tmp_hdr.i_size);
+
+ mic_cons_init();
+ hp = hvc_alloc(MIC_COOKIE, 2, &hvc_mic_ops, 128);
+
+ if (IS_ERR(hp)) {
+ printk(KERN_ERR "unable to allocate hvc console\n");
+ err = PTR_ERR(hp);
+ } else {
+ /* Publish our addresses, then flag readiness last. */
+ writeq(tmp_hdr.i_hdr_addr, &hdr->i_hdr_addr);
+ writeq(tmp_hdr.i_buf_addr, &hdr->i_buf_addr);
+ writel(tmp_hdr.i_size, &hdr->i_size);
+ writel(MIC_VCONS_READY, &hdr->mic_magic);
+ if (dbg)
+ dump_vcons_hdr(hdr);
+
+ return 0;
+ }
+error:
+ /* The #if mirrors the allocation split above: card_type == MIC_KNC
+ * exactly when CONFIG_MK1OM is defined. */
+ if (hdr)
+ iounmap(hdr);
+ if (vcons_info.vcons_op_buf)
+ iounmap(vcons_info.vcons_op_buf);
+#if defined(CONFIG_MK1OM)
+ free_page((unsigned long)vcons_info.vcons_ip_buf);
+ kfree(vcons_info.mic_hdr);
+#else
+ free_page((unsigned long)vcons_info.mic_hdr);
+#endif
+ return err;
+}
+
+/* Module teardown: clear mic_magic (tells the host the console is
+ * gone), drain any residual input from the host, then unmap/free the
+ * resources allocated by hvc_mic_init() and remove the hvc port. */
+static void __exit hvc_mic_exit(void)
+{
+ char buf[8];
+ int ret, len;
+
+ writel(0, &vcons_info.hdr->mic_magic);
+
+ /* Consume whatever is still in the input ring. */
+ do {
+ len = micscif_rb_count(&mic_in_buf, sizeof(buf));
+ ret = micscif_rb_get_next(&mic_in_buf, buf,
+ min(len, (int)sizeof(buf)));
+ } while (ret > 0);
+
+ iounmap(vcons_info.hdr);
+ iounmap(vcons_info.vcons_op_buf);
+#if defined(CONFIG_MK1OM)
+ free_page((unsigned long)vcons_info.vcons_ip_buf);
+ kfree(vcons_info.mic_hdr);
+#else
+ free_page((unsigned long)vcons_info.mic_hdr);
+#endif
+ if (hp)
+ hvc_remove(hp);
+}
+
+MODULE_PARM_DESC(vcons_hdr_addr, "mic address of vcons hdr");
+/* Address of the host-published vcons header, supplied at insmod time. */
+module_param(vcons_hdr_addr, long, S_IRUGO);
+/* Non-zero enables dumping the vcons header during init. */
+module_param(dbg, int, S_IRUGO);
+MODULE_LICENSE("GPL");
+module_init(hvc_mic_init);
+module_exit(hvc_mic_exit);
+
--- /dev/null
+obj-m += mic_virtblk.o
+
--- /dev/null
+/*
+ virtio block device adapted for MIC.
+ copied from drivers/block/virtio_blk.c of Linux kernel
+ It is initially commited by
+ Rusty Russell <rusty@rustcorp.com.au> 2007-10-21 18:03:38
+ with SHA1 ID, e467cde238184d1b0923db2cd61ae1c5a6dc15aa
+
+ drivers/block/virtio_blk.c of Linux kernel does not have copyright notice.
+
+ * For adapting to MIC
+ * (C) Copyright 2012 Intel Corporation
+ * Author: Caz Yokoyama <Caz.Yokoyama@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ */
+//#define DEBUG
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+#include <linux/scatterlist.h>
+#include <linux/list.h>
+#include "mic_common.h"
+#include "mic/micveth_dma.h"
+#include "mic/micscif_intr.h"
+#include "mic/mic_virtio.h"
+
+#define SBOX_MMIO_LENGTH (64 * 1024)
+
+#define PART_BITS 4
+
+#define VIRTQUEUE_LENGTH 128
+#define MIC_VRING_ALIGN PAGE_SIZE
+
+#define INTERRUPT_ID_FOR_VIRTBLK 3
+
+extern int get_sbox_irq(int index);
+
+static int major, index = 0;
+static long virtio_addr = 0;
+static mic_data_t virtblk_mic_data;
+
+/* Per-device state for one MIC virtio block device. */
+struct virtio_blk
+{
+ spinlock_t lock; /* protects vq and reqs */
+
+ struct virtio_device *vdev;
+ struct virtqueue *vq;
+
+ /* The disk structure for the kernel. */
+ struct gendisk *disk;
+
+ /* Request tracking. */
+ struct list_head reqs;
+
+ mempool_t *pool; /* virtblk_req allocations */
+
+ /* virtual address of blk_config */
+ void __iomem *ioaddr;
+
+ /* What host tells us, plus 2 for header & tailer. */
+ unsigned int sg_elems;
+
+ /* sbox va */
+ u8 *sbox;
+
+ /* Scatterlist: can be too big for stack. */
+ struct scatterlist sg[/*sg_elems*/];
+};
+
+/* One in-flight block request: linked on virtio_blk.reqs, carries the
+ * virtio out-header sent to the host and the status byte it returns. */
+struct virtblk_req
+{
+ struct list_head list;
+ struct request *req;
+ struct virtio_blk_outhdr out_hdr;
+ struct virtio_scsi_inhdr in_hdr; /* only used for SCSI (BLOCK_PC) requests */
+ u8 status; /* written by the host: VIRTIO_BLK_S_* */
+};
+
+/* True for generic SCSI-command (packet) requests. */
+#define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
+
+/* The following vring_virtqueue and to_vvq() are copied from virtio_ring.c. Please name sure you have the same structure
+ as in virtio_ring.c. The reason why they are copied is that I don't want to change virtio_ring.c which is a symbolic link.
+*/
+struct vring_virtqueue
+{
+ struct virtqueue vq;
+
+ /* Actual memory layout for this queue */
+ struct vring vring;
+
+ /* Other side has made a mess, don't try any more. */
+ bool broken;
+
+ /* Host supports indirect buffers */
+ bool indirect;
+
+ /* Number of free buffers */
+ unsigned int num_free;
+ /* Head of free buffer list. */
+ unsigned int free_head;
+ /* Number we've added since last sync. */
+ unsigned int num_added;
+
+ /* Last used index we've seen. */
+ u16 last_used_idx;
+
+ /* How to notify other side. FIXME: commonalize hcalls! */
+ void (*notify)(struct virtqueue *vq);
+
+#ifdef DEBUG
+ /* They're supposed to lock for us. */
+ unsigned int in_use;
+#endif
+
+ struct _mic_ctx_t *mic_ctx;
+ /* Tokens for callbacks. */
+ void *data[];
+};
+
+#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+
+/* Virtqueue callback: reap completed requests from the used ring,
+ * translate the host's VIRTIO_BLK_S_* status into an errno, complete
+ * each request and restart the queue (it may have been stopped while
+ * waiting for buffers). Runs with vblk->lock held, irqs saved. */
+static void blk_done(struct virtqueue *vq)
+{
+ struct virtio_blk *vblk = vq->vdev->priv;
+ struct virtblk_req *vbr;
+ unsigned int len;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vblk->lock, flags);
+ while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
+ int error;
+
+ switch (vbr->status) {
+ case VIRTIO_BLK_S_OK:
+ error = 0;
+ break;
+ case VIRTIO_BLK_S_UNSUPP:
+ error = -ENOTTY;
+ break;
+ default:
+ error = -EIO;
+ break;
+ }
+
+ /* SCSI passthrough: copy back residual/sense/error info. */
+ if (blk_pc_request(vbr->req)) {
+ vbr->req->resid_len = vbr->in_hdr.residual;
+ vbr->req->sense_len = vbr->in_hdr.sense_len;
+ vbr->req->errors = vbr->in_hdr.errors;
+ }
+
+ __blk_end_request_all(vbr->req, error);
+ list_del(&vbr->list);
+ mempool_free(vbr, vblk->pool);
+ }
+ /* In case queue is stopped waiting for more buffers. */
+ blk_start_queue(vblk->disk->queue);
+ spin_unlock_irqrestore(&vblk->lock, flags);
+}
+
+/* Build the scatterlist for one request and post it on the virtqueue.
+ * Layout order is fixed by the virtio-blk spec: out_hdr, [scsi cmd],
+ * data pages, [sense + in_hdr], status byte. Returns false if no
+ * virtblk_req or no ring descriptors are available (caller stops the
+ * queue and retries when a request completes). */
+static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
+ struct request *req)
+{
+ unsigned long num, out = 0, in = 0;
+ struct virtblk_req *vbr;
+
+ vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
+ if (!vbr)
+ /* When another request finishes we'll try again. */
+ return false;
+
+ vbr->req = req;
+
+ /* Fill the out-header according to the request type. */
+ if (req->cmd_flags & REQ_FLUSH) {
+ vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
+ vbr->out_hdr.sector = 0;
+ vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+ } else {
+ switch (req->cmd_type) {
+ case REQ_TYPE_FS:
+ vbr->out_hdr.type = 0;
+ vbr->out_hdr.sector = blk_rq_pos(vbr->req);
+ vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+ break;
+ case REQ_TYPE_BLOCK_PC:
+ vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
+ vbr->out_hdr.sector = 0;
+ vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+ break;
+ case REQ_TYPE_SPECIAL:
+ vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID;
+ vbr->out_hdr.sector = 0;
+ vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+ break;
+ default:
+ /* We don't put anything else in the queue. */
+ BUG();
+ }
+ }
+
+ sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
+
+ /*
+ * If this is a packet command we need a couple of additional headers.
+ * Behind the normal outhdr we put a segment with the scsi command
+ * block, and before the normal inhdr we put the sense data and the
+ * inhdr with additional status information before the normal inhdr.
+ */
+ if (blk_pc_request(vbr->req))
+ sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
+
+ num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
+
+ if (blk_pc_request(vbr->req)) {
+ sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96);
+ sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
+ sizeof(vbr->in_hdr));
+ }
+
+ /* Status byte is always the final (device-writable) segment. */
+ sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
+ sizeof(vbr->status));
+
+ /* Data segments count as host-readable for writes, writable for reads. */
+ if (num) {
+ if (rq_data_dir(vbr->req) == WRITE) {
+ vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+ out += num;
+ } else {
+ vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+ in += num;
+ }
+ }
+
+ if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
+ mempool_free(vbr, vblk->pool);
+ return false;
+ }
+
+ list_add_tail(&vbr->list, &vblk->reqs);
+ return true;
+}
+
+/* Block-layer request function: move as many queued requests as
+ * possible onto the virtqueue, then notify the host once at the end
+ * (one kick for the whole batch). Called with the queue lock held. */
+static void do_virtblk_request(struct request_queue *q)
+{
+ struct virtio_blk *vblk = q->queuedata;
+ struct request *req;
+ unsigned int issued = 0;
+
+ while ((req = blk_peek_request(q)) != NULL) {
+ BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
+
+ /* If this request fails, stop queue and wait for something to
+ finish to restart it. */
+ if (!do_req(q, vblk, req)) {
+ blk_stop_queue(q);
+ break;
+ }
+ blk_start_request(req);
+ issued++;
+ }
+
+ if (issued)
+ virtqueue_kick(vblk->vq);
+}
+
+/*
+ * Read the disk capacity from the host's virtio config space and apply
+ * it to the gendisk. Returns 0 on success, or -ENXIO when the host
+ * reports zero capacity (no backing virtblk file configured).
+ */
+static int
+set_capacity_from_host(struct virtio_blk *vblk)
+{
+	struct virtio_device *vdev = vblk->vdev;
+	u64 cap;
+
+	/* The host must always supply a capacity. */
+	vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
+			  &cap, sizeof(cap));
+	if (!cap) {
+		printk(KERN_ERR "Have you set virtblk file?\n");
+		return -ENXIO;
+	}
+
+	/* Truncate, with a warning, anything sector_t cannot represent. */
+	if ((sector_t)cap != cap) {
+		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
+			 (unsigned long long)cap);
+		cap = (sector_t)-1;
+	}
+	set_capacity(vblk->disk, cap);
+
+	return 0;
+}
+
+static int
+virtblk_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct virtio_blk *vblk = disk->private_data;
+
+ return set_capacity_from_host(vblk);
+}
+
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long data)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct virtio_blk *vblk = disk->private_data;
+
+ /*
+ * Only allow the generic SCSI ioctls if the host can support it.
+ */
+ if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
+ return -ENOTTY;
+
+ return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
+ (void __user *)data);
+}
+
+/* We provide getgeo only to please some old bootloader/partitioning tools */
+static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+ struct virtio_blk *vblk = bd->bd_disk->private_data;
+ struct virtio_blk_geometry vgeo;
+ int err;
+
+ /* see if the host passed in geometry config */
+ err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY,
+ offsetof(struct virtio_blk_config, geometry),
+ &vgeo);
+
+ if (!err) {
+ geo->heads = vgeo.heads;
+ geo->sectors = vgeo.sectors;
+ geo->cylinders = vgeo.cylinders;
+ } else {
+ /* some standard values, similar to sd */
+ geo->heads = 1 << 6;
+ geo->sectors = 1 << 5;
+ geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ }
+ return 0;
+}
+
+static const struct block_device_operations virtblk_fops = {
+ .open = virtblk_open,
+ .ioctl = virtblk_ioctl,
+ .owner = THIS_MODULE,
+ .getgeo = virtblk_getgeo,
+};
+
+static int index_to_minor(int index)
+{
+ return index << PART_BITS;
+}
+
/*
 * Return true when the device has published used-ring entries we have
 * not yet consumed (i.e. the used index moved past our cached one).
 * NOTE(review): reads vq->vring.used->idx without an explicit barrier;
 * presumably the interrupt path provides the needed ordering — confirm
 * against the virtio_ring.c conventions for this kernel version.
 */
static inline bool more_used(const struct vring_virtqueue *vq)
{
	return vq->last_used_idx != vq->vring.used->idx;
}
+
+static irqreturn_t
+mic_virtblk_intr_handler(int irq, void *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ if (!more_used(vq)) {
+ pr_debug("virtqueue interrupt with no work for %p\n", vq);
+ goto _exit_;
+ }
+
+ if (unlikely(vq->broken))
+ goto _exit_;
+
+ pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
+ if (vq->vq.callback)
+ vq->vq.callback(&vq->vq);
+
+ _exit_:
+ return IRQ_HANDLED;
+}
+
+static int __devinit virtblk_probe(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk;
+ struct request_queue *q;
+ int err;
+ u32 v, blk_size, sg_elems, opt_io_size;
+ u16 min_io_size;
+ u8 physical_block_exp, alignment_offset;
+ struct board_info *bd_info = virtblk_mic_data.dd_bi[0];
+ struct vb_shared *vb_shared;
+
+ if (index_to_minor(index) >= 1 << MINORBITS)
+ return -ENOSPC;
+
+ vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared;
+ vdev->features[0] = readl(&vb_shared->host_features);
+
+ /* We need to know how many segments before we allocate. */
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
+ offsetof(struct virtio_blk_config, seg_max),
+ &sg_elems);
+ if (err)
+ sg_elems = 1;
+
+ /* We need an extra sg elements at head and tail. */
+ sg_elems += 2;
+ vdev->priv = vblk = kmalloc(sizeof(*vblk) +
+ sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
+ if (!vblk) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&vblk->reqs);
+ spin_lock_init(&vblk->lock);
+ vblk->vdev = vdev;
+ vblk->sg_elems = sg_elems;
+ sg_init_table(vblk->sg, vblk->sg_elems);
+
+ /* map sbox */
+ vblk->sbox = ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH);
+ if (!vblk->sbox) {
+ printk(KERN_ERR "%s: NULL SBOX ptr\n", __func__);
+ err = -ENOMEM;
+ goto out_free_vblk;
+ }
+
+ /* We expect one virtqueue, for output. */
+ vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
+ if (IS_ERR(vblk->vq)) {
+ err = PTR_ERR(vblk->vq);
+ goto out_unmap_sbox;
+ }
+
+ if ((err = request_irq(get_sbox_irq(VIRTIO_SBOX_INT_IDX),
+ mic_virtblk_intr_handler, IRQF_DISABLED,
+ "virtio intr", vblk->vq))) {
+ printk(KERN_ERR "%s: can't register interrupt: %d\n", __func__, err);
+ goto out_free_vq;
+ }
+
+ vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
+ if (!vblk->pool) {
+ err = -ENOMEM;
+ goto out_free_irq;
+ }
+
+ /* FIXME: How many partitions? How long is a piece of string? */
+ vblk->disk = alloc_disk(1 << PART_BITS);
+ if (!vblk->disk) {
+ err = -ENOMEM;
+ goto out_mempool;
+ }
+
+ q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
+ if (!q) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
+ q->queuedata = vblk;
+
+ if (index < 26) {
+ sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
+ } else if (index < (26 + 1) * 26) {
+ sprintf(vblk->disk->disk_name, "vd%c%c",
+ 'a' + index / 26 - 1, 'a' + index % 26);
+ } else {
+ const unsigned int m1 = (index / 26 - 1) / 26 - 1;
+ const unsigned int m2 = (index / 26 - 1) % 26;
+ const unsigned int m3 = index % 26;
+ sprintf(vblk->disk->disk_name, "vd%c%c%c",
+ 'a' + m1, 'a' + m2, 'a' + m3);
+ }
+
+ vblk->disk->major = major;
+ vblk->disk->first_minor = index_to_minor(index);
+ vblk->disk->private_data = vblk;
+ vblk->disk->fops = &virtblk_fops;
+ vblk->disk->driverfs_dev = NULL; // There is no parent device.
+ index++;
+
+ /* configure queue flush support */
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+ blk_queue_flush(q, REQ_FLUSH);
+
+ /* If disk is read-only in the host, the guest should obey */
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) {
+ if (vdev->config->get_features(vdev) & (1U << VIRTIO_BLK_F_RO)) {
+ set_disk_ro(vblk->disk, 1);
+ }
+ }
+
+ err = set_capacity_from_host(vblk);
+ if (err)
+ goto out_put_disk;
+
+ /* We can handle whatever the host told us to handle. */
+ blk_queue_max_segments(q, vblk->sg_elems-2);
+
+ /* No need to bounce any requests */
+ blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+
+ /* No real sector limit. */
+ blk_queue_max_hw_sectors(q, -1U);
+
+ /* Host can optionally specify maximum segment size and number of
+ * segments. */
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX,
+ offsetof(struct virtio_blk_config, size_max),
+ &v);
+ if (!err)
+ blk_queue_max_segment_size(q, v);
+ else
+ blk_queue_max_segment_size(q, -1U);
+
+ /* Host can optionally specify the block size of the device */
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE,
+ offsetof(struct virtio_blk_config, blk_size),
+ &blk_size);
+ if (!err)
+ blk_queue_logical_block_size(q, blk_size);
+ else
+ blk_size = queue_logical_block_size(q);
+
+ /* Use topology information if available */
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ offsetof(struct virtio_blk_config, physical_block_exp),
+ &physical_block_exp);
+ if (!err && physical_block_exp)
+ blk_queue_physical_block_size(q,
+ blk_size * (1 << physical_block_exp));
+
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ offsetof(struct virtio_blk_config, alignment_offset),
+ &alignment_offset);
+ if (!err && alignment_offset)
+ blk_queue_alignment_offset(q, blk_size * alignment_offset);
+
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ offsetof(struct virtio_blk_config, min_io_size),
+ &min_io_size);
+ if (!err && min_io_size)
+ blk_queue_io_min(q, blk_size * min_io_size);
+
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ offsetof(struct virtio_blk_config, opt_io_size),
+ &opt_io_size);
+ if (!err && opt_io_size)
+ blk_queue_io_opt(q, blk_size * opt_io_size);
+
+ add_disk(vblk->disk);
+ return 0;
+
+out_put_disk:
+ put_disk(vblk->disk);
+out_mempool:
+ mempool_destroy(vblk->pool);
+out_free_irq:
+ free_irq(get_sbox_irq(VIRTIO_SBOX_INT_IDX), vblk->vq);
+out_free_vq:
+ vdev->config->del_vqs(vdev);
+out_unmap_sbox:
+ iounmap(vblk->sbox);
+out_free_vblk:
+ kfree(vblk);
+out:
+ return err;
+}
+
/*
 * Tear down a probed device.  Mirrors virtblk_probe() in reverse order;
 * the IRQ is released first so nothing can kick the queue mid-teardown.
 */
static void __devexit virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Nothing should be pending. */
	BUG_ON(!list_empty(&vblk->reqs));

	free_irq(get_sbox_irq(VIRTIO_SBOX_INT_IDX), vblk->vq);

	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);

	/* Unwind the remaining probe-time resources in reverse order. */
	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);
	put_disk(vblk->disk);
	mempool_destroy(vblk->pool);
	vdev->config->del_vqs(vdev);
	iounmap(vblk->sbox);
	kfree(vblk);
}
+
+/* config->get_features() implementation */
+static u32 virtblk_get_features(struct virtio_device *vdev)
+{
+ /* When someone needs more than 32 feature bits, we'll need to
+ * steal a bit to indicate that the rest are somewhere else. */
+ struct board_info *bd_info = virtblk_mic_data.dd_bi[0];
+ struct vb_shared *vb_shared;
+
+ vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared;
+ return readl(&vb_shared->host_features);
+}
+
+/* virtio config->finalize_features() implementation */
+static void virtblk_finalize_features(struct virtio_device *vdev)
+{
+ struct board_info *bd_info = virtblk_mic_data.dd_bi[0];
+ struct vb_shared *vb_shared;
+
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
+ /* We only support 32 feature bits. */
+ BUILD_BUG_ON(ARRAY_SIZE(vdev->features) != 1);
+
+ vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared;
+ writel(vdev->features[0], &vb_shared->client_features);
+}
+
+/* config->get() implementation */
+static void virtblk_get(struct virtio_device *vdev, unsigned offset,
+ void *buf, unsigned len)
+{
+ struct board_info *bd_info = virtblk_mic_data.dd_bi[0];
+ struct vb_shared *vb_shared;
+ void *ioaddr;
+ u8 *ptr = buf;
+ int i;
+
+ vb_shared = ((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared;
+ ioaddr = (void *)&vb_shared->blk_config + offset;
+ for (i = 0; i < len; i++)
+ ptr[i] = readb(ioaddr + i);
+}
+
/*
 * config->reset() implementation.  Intentionally empty: this transport
 * exposes no device reset mechanism here, so reset is a no-op.
 */
static void virtblk_reset(struct virtio_device *vdev)
{
}
+
+/* the notify function used when creating a virt queue */
+static void virtblk_notify(struct virtqueue *vq)
+{
+ const int doorbell = 2;
+ struct virtio_blk *vblk = vq->vdev->priv;
+ uint32_t db_reg;
+
+ /* Ring host doorbell interrupt */
+ db_reg = readl(vblk->sbox + (SBOX_SDBIC0 + (4 * doorbell)))
+ | SBOX_SDBIC0_DBREQ_BIT;
+ writel(db_reg, vblk->sbox + (SBOX_SDBIC0 + (4 * doorbell)));
+}
+
+/* the config->del_vqs() implementation */
+static void virtblk_del_vqs(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+ unsigned long size;
+
+ size = PAGE_ALIGN(vring_size(VIRTQUEUE_LENGTH, MIC_VRING_ALIGN));
+ free_pages_exact(vblk->vq->priv, size);
+
+ vring_del_virtqueue(vblk->vq);
+ vblk->vq = NULL;
+}
+
/* the config->find_vqs() implementation.
 * Allocates the single request ring, wraps it in a virtqueue, and then
 * publishes the ring geometry and physical addresses to the host via the
 * shared area.  The publication writes happen last, only after the ring
 * is fully constructed.
 */
static int virtblk_find_vqs(struct virtio_device *vdev, unsigned nvqs,
			    struct virtqueue *vqs[],
			    vq_callback_t *callbacks[],
			    const char *names[])
{
	struct virtio_blk *vblk = vdev->priv;
	struct virtqueue *vq;
	int err;
	unsigned long size;
	void *queue; /* the virtual address of the ring queue */
	struct vring_virtqueue *vvq;
	struct vring *vring;
	struct board_info *bd_info = virtblk_mic_data.dd_bi[0];

	/* Only one queue is supported, and probe must already have set priv. */
	BUG_ON(nvqs != 1);
	BUG_ON(vblk == NULL);

	/* Physically contiguous, zeroed pages for the vring. */
	size = PAGE_ALIGN(vring_size(VIRTQUEUE_LENGTH, MIC_VRING_ALIGN));
	queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
	if (queue == NULL) {
		err = -ENOMEM;
		goto out_info;
	}

	/* create the vring */
	vq = vring_new_virtqueue(VIRTQUEUE_LENGTH, MIC_VRING_ALIGN,
				 vdev, queue, virtblk_notify, callbacks[0], names[0]);
	if (vq == NULL) {
		err = -ENOMEM;
		goto out_activate_queue;
	}
	/* Remember the ring pages so virtblk_del_vqs() can free them. */
	vq->priv = queue;

	vqs[0] = vblk->vq = vq;

	/* Tell the host where the ring lives (physical addresses). */
	vvq = to_vvq(vq);
	vring = &((struct mic_virtblk *)bd_info->bi_virtio)->vb_shared->vring;
	writel(vvq->vring.num, &vring->num);
	writeq(virt_to_phys(vvq->vring.desc), &vring->desc);
	writeq(virt_to_phys(vvq->vring.avail), &vring->avail);
	writeq(virt_to_phys(vvq->vring.used), &vring->used);

	return 0;

out_activate_queue:
	free_pages_exact(queue, size);
out_info:
	return err;
}
+
+static struct virtio_config_ops virtio_blk_config_ops = {
+ .get = virtblk_get,
+ // .set = vp_set,
+ // .get_status = vp_get_status,
+ // .set_status = vp_set_status,
+ .reset = virtblk_reset,
+ .find_vqs = virtblk_find_vqs,
+ .del_vqs = virtblk_del_vqs,
+ .get_features = virtblk_get_features,
+ .finalize_features = virtblk_finalize_features,
+};
+
+static unsigned int features[] = {
+ VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+ VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+};
+
+/*
+ * virtio_blk causes spurious section mismatch warning by
+ * simultaneously referring to a __devinit and a __devexit function.
+ * Use __refdata to avoid this warning.
+ */
static struct virtio_driver __refdata virtio_blk = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	/*
	 * NOTE(review): .probe/.remove/.id_table are deliberately absent;
	 * init()/fini() call virtblk_probe()/virtblk_remove() directly
	 * instead of registering this driver with a virtio bus.
	 */
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
};
+
/*
 * Minimal class assigned to the hand-built virtio_device in init().
 * NOTE(review): this defines a global named like the kernel's own
 * block_class — confirm there is no symbol clash with genhd.
 */
struct class block_class = {
	.name = "block",
};
+
/* Minimal device type for the hand-built virtio_device; the commented
 * members show what a full implementation would additionally provide. */
static struct device_type disk_type = {
	.name = "disk",
	/*
	.groups = disk_attr_groups,
	.release = disk_release,
	.devnode = block_devnode,
	*/
};
+
+static int __init init(void)
+{
+ bd_info_t *bd_info;
+ struct virtio_device *vdev;
+ struct mic_virtblk *mic_virtblk;
+ int ret;
+ struct vb_shared *vb_shared;
+
+#ifdef CONFIG_ML1OM
+ printk(KERN_ERR "virtio block device is not available on KNF\n");
+ ret = -ENODEV;
+ goto error_return;
+#endif
+ major = register_blkdev(0, "virtblk");
+ if (major < 0) {
+ ret = major;
+ goto error_return;
+ }
+
+ bd_info = kmalloc(sizeof(bd_info_t), GFP_KERNEL);
+ if (bd_info == NULL) {
+ ret = -ENOMEM;
+ goto error_return;
+ }
+ memset(bd_info, 0, sizeof(*bd_info));
+ virtblk_mic_data.dd_numdevs = 1;
+ index = 0;
+ virtblk_mic_data.dd_bi[0] = bd_info;
+ bd_info->bi_ctx.bi_id = 0;
+
+ mic_virtblk = kmalloc(sizeof(*mic_virtblk), GFP_KERNEL);
+ if (mic_virtblk == NULL) {
+ ret = -ENOMEM;
+ goto free_bd_info;
+ }
+ memset(mic_virtblk, 0, sizeof(*mic_virtblk));
+ bd_info->bi_virtio = (void *)mic_virtblk;
+
+ if (virtio_addr == 0) {
+ printk(KERN_ERR "virtio address is not passed from host\n");
+ return -ENODEV;
+ goto free_mic_virtblk;
+ }
+ vb_shared = ioremap_nocache(virtio_addr, sizeof(*vb_shared));
+ if (vb_shared == NULL) {
+ ret = -ENODEV;
+ goto free_mic_virtblk;
+ }
+ vb_shared->update = true;
+ mic_virtblk->vb_shared = vb_shared;
+
+ vdev = kmalloc(sizeof(*vdev), GFP_KERNEL);
+ if (vdev == NULL) {
+ ret = -ENOMEM;
+ goto free_mic_virtblk;
+ }
+ memset(vdev, 0, sizeof(*vdev));
+ vdev->config = &virtio_blk_config_ops;
+ INIT_LIST_HEAD(&vdev->vqs);
+ vdev->dev.driver = &virtio_blk.driver;
+ vdev->dev.class = &block_class;
+ vdev->dev.type = &disk_type;
+ device_initialize(&vdev->dev);
+ mic_virtblk->vdev = (void *)vdev;
+
+ return virtblk_probe(vdev);
+
+ free_mic_virtblk:
+ kfree(bd_info->bi_virtio);
+ free_bd_info:
+ kfree(bd_info);
+ error_return:
+ return ret;
+}
+
/*
 * Module exit: undo init() in reverse.  The blkdev major is unregistered
 * first so no new opens arrive while the device is being dismantled.
 */
static void __exit fini(void)
{
	bd_info_t *bd_info = virtblk_mic_data.dd_bi[0];
	struct mic_virtblk *mic_virtblk = (struct mic_virtblk *)bd_info->bi_virtio;

	unregister_blkdev(major, "virtblk");
	virtblk_remove(mic_virtblk->vdev);
	iounmap(mic_virtblk->vb_shared);
	kfree(mic_virtblk->vdev);
	kfree(bd_info->bi_virtio);
	kfree(bd_info);
}
module_init(init);
module_exit(fini);

MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");
/* Host-supplied physical address of the shared vb_shared structure. */
MODULE_PARM_DESC(virtio_addr, "address of virtio related structure");
module_param(virtio_addr, long, S_IRUGO);
--- /dev/null
# Kbuild fragment: build the MIC virtual ethernet driver as a module.
obj-m += intel_micveth.o

intel_micveth-objs := micveth.o micveth_param.o micveth_dma.o
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
#ifndef MICDLDR_H
#define MICDLDR_H

/* Values for mic_upload.up_dcons (debug console control). */
#define MIC_DECONS_DISABLE 0
#define MIC_DECONS_ENABLE 1

/* Argument for the MIC_UPLOAD_UOS ioctl: describes a uOS image upload. */
typedef struct mic_upload {
	int up_brdnum;
	int up_uossize;
	char *up_uosbuf;
	int up_dcons;
	int up_uoslog;
	int up_uosreserve;
} mic_upload_t;

/* Argument for the MIC_SYS_CONFIG ioctl. */
typedef struct mic_sys_config {
	int sc_numCards;
} mic_sys_config_t;

/* uOS boot-state values (see mic_brd_config.bc_uOSstate). */
#define UOS_NOT_BOOTED 0
#define UOS_BOOTING 1
#define UOS_BOOT_FAILED 2
#define UOS_BOOT_SUCCEED 3
#define UOS_RUNNING 4
#define UOS_WEDGED 5
#define UOS_UNKNOWN 6

#define PCI_VENDOR_INTEL 0x8086

/* PCIe link generation codes. */
#define PCI_SPEED_GEN1 1
#define PCI_SPEED_GEN2 2

/* GDDR vendor and density code values. */
#define GDDR_VENDOR_SAMSUNG 1
#define GDDR_VENDOR_QIMONDA 2
#define GDDR_VENDOR_HYNIX 6

#define GDDR_DENSITY_512MB 0
#define GDDR_DENSITY_1GB 1

/* Argument for the MIC_BRD_CONFIG ioctl: per-board hardware description. */
typedef struct mic_brd_config {
	int bc_brdnum;
	struct {
		char step[4];
		int freqMhz;
		int vid;
		int uvolts;
	} bc_core;
	struct {
		unsigned short vendor;
		unsigned short device;
		unsigned int class;
		char capableSpeed;
		char capableWidth;
		char currentSpeed;
		char currentWidth;
	} bc_pcie;
	struct {
		char vendor;
		char density;
		char fifoDepth;
		short freq; // MT/sec
		int size; // Mbytes
	} bc_gddr;
	int bc_uOSstate;
} mic_brd_config_t;

/* ioctl numbers for the MIC loader device. */
#define MIC_UPLOAD_UOS _IOWR('l', 1, struct mic_upload)
#define MIC_RESET_UOS _IOWR('l', 2, int)
#define MIC_SYS_CONFIG _IOWR('l', 3, struct mic_sys_config)
#define MIC_BRD_CONFIG _IOWR('l', 4, struct mic_brd_config)

#endif // MICDLDR_H
+
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/list.h>
+
+#include "mic/micveth.h"
+
+#define PWR_MGMT_NO_POLL_AFTER_LINKS_UP 1
+
+/* #define HOST */
+#define SBOX_MMIO_LENGTH (64 * 1024)
+
+/* Host - Card link initialization rotocol
+ * Card comes up and writes MICVETH_LINK_UP_MAGIC to scratch 14 & 15
+ * Host detects that the card side interface is up and writes the
+ * 1) address of the tx/rx descriptor ring buffer to scratch 14 & 15
+ * 2) last 2 octets of the MAC address (allows the host to identify
+ * the board number based on its mac address)
+ */
+
+/* Host - Card descriptor queue/ring buffer (from the perspective of the host)
+ *
+ * There is a transmit and a receive queue. Each queue entry has
+ * a physical address and a length.
+ *
+ * Packet transmission
+ * The host adds a queue entry with the physical address of the skb and its
+ * length and updates the write pointer. The receive side on the card sees the
+ * new entry, allocates a new skb, maps the host's skb, copies it to a locally
+ * allocated skb and updates the read pointer. The host side later frees up skbs
+ * starting from a cached read pointer upto the read pointer
+ *
+ * Packet reception
+ * The host "posts" skbs to the rx queue. The transmit routine on the card
+ * copies its local skb to the host skb, updates the write pointer and frees
+ * its local skb
+ */
+
+/* Vnet interrupts are now functional (with vnet=dma module parameter). In the
+ main flow of the driver all polling in the interrupt mode has been
+ eliminated. However, polling is still happening in clientpoll() routine which
+ tracks if the link is up or down. This can also be replaced by an interrupt
+ driven mechanism which will be done in the future. Apart from this, only
+ limited testing has been done in the interrupt mode, especially with respect
+ to sharing the interrupt with scif. Therefore, for now the default mode of
+ operation is still left as poll in micstart.
+*/
+
+#define SBOX_SDBIC0_DBREQ_BIT 0x80000000
+
+
+#ifdef HOST
+#else
+struct skb_node {
+ struct list_head list;
+ struct sk_buff *skb;
+};
+
+/* List of skbs to be transmitted - global for now assumes KN* has a single interface */
+struct list_head skb_list;
+LIST_HEAD(skb_list);
+#endif
+
+static void _micveth_process_descriptors(micveth_info_t *veth_info);
+
+#ifdef HOST
+#else
+static int micveth_xmit_enqueue(struct sk_buff *skb, struct net_device *dev, micveth_info_t *veth_info);
+static int micveth_xmit_dequeue(struct net_device *dev, micveth_info_t *veth_info);
+static struct sk_buff *dequeue_skb(micveth_info_t *veth_info);
+static void micvnet_tx_dequeue_handler(struct work_struct *work);
+
+int micveth_start(mic_ctx_t *mic_ctx);
+void micveth_stop(mic_ctx_t *mic_ctx);
+static int micveth_start_dev(struct net_device *dev);
+static int micveth_stop_dev(struct net_device *dev);
+#endif
+
+static void micveth_clientpoll(struct work_struct *work);
+static void micveth_poll(struct work_struct *work);
+static irqreturn_t micvnet_host_intr_handler(int irq, void *cookie);
+static void micvnet_intr_bh_handler(struct work_struct *work);
+static void micveth_send_intr(micveth_info_t *veth_info);
+int get_sbox_irq(int index);
+
+#ifdef HOST
+#else
+static mic_ctx_t mic_ctx_g;
+#endif
+
+micveth_t micveth;
+
+static int
+micveth_set_address(struct net_device *dev, void *p)
+{
+ struct sockaddr *sa = p;
+
+ if (!is_valid_ether_addr(sa->sa_data))
+ return -EADDRNOTAVAIL;
+
+ memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
+ return 0;
+}
+
/* Multicast filtering is not supported; accept the callback and do nothing. */
static void
micveth_multicast_list(struct net_device *dev)
{
}
+
+#ifdef HOST
+#else
+/* Enqueues an skb for transmission. This is necessary because micveth_xmit is called in
+ interrupt context and we cannot call ioremap_nocache from interrupt context. */
+static int
+micveth_xmit_enqueue(struct sk_buff *skb, struct net_device *dev, micveth_info_t *veth_info)
+{
+ struct skb_node *new_node = kmalloc(sizeof(*new_node), GFP_ATOMIC);
+
+ if (!new_node)
+ return ENOMEM;
+ new_node->skb = skb;
+ spin_lock(&veth_info->vi_txlock);
+ list_add_tail(&new_node->list, &skb_list);
+ spin_unlock(&veth_info->vi_txlock);
+ return 0;
+}
+
+/* Dequeues a skb enqueued by micveth_xmit_enqueue */
+static struct sk_buff *
+dequeue_skb(micveth_info_t *veth_info)
+{
+ struct sk_buff *skb = NULL;
+ struct skb_node *skb_node = NULL;
+
+ spin_lock_bh(&veth_info->vi_txlock);
+ if (!list_empty(&skb_list))
+ {
+ skb_node = list_entry(skb_list.next, struct skb_node , list);
+ list_del(&skb_node->list);
+ skb = skb_node->skb;
+ }
+ spin_unlock_bh(&veth_info->vi_txlock);
+
+ if (skb_node)
+ kfree(skb_node);
+ return skb;
+}
+
+/* Transmits skbs that have been enqueued by the by micveth_xmit_enqueue */
+static int
+micveth_xmit_dequeue(struct net_device *dev, micveth_info_t *veth_info)
+{
+ veth_ring_t *ring;
+ ring_queue_t *tx_queue;
+ ring_desc_t *desc;
+ int next_tail;
+ void *dst;
+ struct sk_buff *skb;
+
+ while ((skb = dequeue_skb(veth_info))) {
+ ring = veth_info->ring_ptr;
+ tx_queue = &ring->r_rx;
+
+ next_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length;
+ if (next_tail == tx_queue->rq_head) {
+ printk(KERN_WARNING "dropping packet\n");
+ /* queue_full situation - just drop the packet and let the stack retry */
+ return 1;
+ }
+
+ desc = &tx_queue->rq_descs[tx_queue->rq_tail];
+ dst = ioremap_nocache(desc->rd_phys, skb->len);
+ if (!dst) {
+ tx_queue->rq_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length;
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ continue;
+ }
+ desc->rd_length = skb->len;
+ desc->rd_valid = 1;
+ memcpy(dst, skb->data, skb->len);
+ /*
+ * Need a write memory barrier between copying the skb data to
+ * the buffer and updating the tail pointer. NOT an smp_wmb(),
+ * because this memory barrier needs to be done even if there is
+ * a single CPU in the system.
+ *
+ * No need for the serializing request (Si bug workaround in
+ * KNF), since the buffer exists in host memory. If the buffer
+ * lives in card memory, and this code is running on the host, we
+ * would need extra barriers and a "serializing request" on any write.
+ */
+ wmb();
+ tx_queue->rq_tail = (tx_queue->rq_tail + 1) % tx_queue->rq_length;
+ iounmap(dst);
+ dev_kfree_skb(skb);
+
+ if (mic_vnet_mode == VNET_MODE_INTR) {
+ micveth_send_intr(veth_info);
+ }
+ }
+
+ return 0;
+}
+
+static void
+micvnet_tx_dequeue_handler(struct work_struct *work)
+{
+ micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_txws);
+ struct net_device *dev_veth = veth_info->vi_netdev;
+
+ micveth_xmit_dequeue(dev_veth, veth_info);
+}
+#endif
+
+#ifdef HOST
+#else // card
+/* Transmit callback */
+static int
+micveth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ micveth_info_t *veth_info;
+
+ if (be16_to_cpu(skb->protocol) == ETH_P_IPV6) {
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ veth_info = &micveth.lv_info[0];
+ if (veth_info->vi_state == VETH_STATE_LINKUP) {
+ if (micveth_xmit_enqueue(skb, dev, veth_info)) {
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ }
+ } else {
+ dev_kfree_skb(skb);
+ }
+
+ /* Reuse the interrupt workqueue to also queue tx dequeue tasks */
+ queue_work(veth_info->vi_wq, &veth_info->vi_txws);
+
+ return NETDEV_TX_OK;
+}
+#endif
+
/* MTU change callback.
 * NOTE(review): no bounds checking is performed; consider validating
 * new_mtu against MICVETH_MAX_PACKET_SIZE. */
static int
micveth_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	return 0;
}
+
+
+/* Start callback */
+static int
+micveth_start_dev(struct net_device *dev)
+{
+ micveth_info_t *veth_info = dev->ml_priv;
+
+ micveth_start(veth_info->mic_ctx);
+ return 0;
+}
+
+/* Stop callback */
+static int
+micveth_stop_dev(struct net_device *dev)
+{
+ micveth_info_t *veth_info = dev->ml_priv;
+
+ micveth_stop(veth_info->mic_ctx);
+ return 0;
+}
+
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28)
/* net_device_ops wiring for kernels that use the ops table; older
 * kernels get the same callbacks assigned directly in micveth_setup(). */
static const struct net_device_ops veth_netdev_ops = {
	.ndo_open = micveth_start_dev,
	.ndo_stop = micveth_stop_dev,
	.ndo_start_xmit = micveth_xmit,
	.ndo_validate_addr = eth_validate_addr,
	.ndo_set_multicast_list = micveth_multicast_list,
	.ndo_set_mac_address = micveth_set_address,
	.ndo_change_mtu = micveth_change_mtu,
};
#endif
+
/*
 * net_device setup callback used by alloc_netdev()/rtnl link creation:
 * installs the transmit/address callbacks (version-dependent), applies
 * ethernet defaults, and assigns a random MAC.
 */
static void
micveth_setup(struct net_device *dev)
{
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,28)
	/* Pre-ops kernels: callbacks live directly on the net_device. */
	dev->hard_start_xmit = micveth_xmit;
	dev->set_multicast_list = micveth_multicast_list;
	dev->set_mac_address = micveth_set_address;
#endif
	ether_setup(dev);

	/* Initialize the device structure. */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28)
	dev->netdev_ops = &veth_netdev_ops;
#endif
	dev->destructor = free_netdev;

	/* Fill in device structure with ethernet-generic values. */
	dev->mtu = (MICVETH_MAX_PACKET_SIZE);
	dev->tx_queue_len = 0;
	dev->flags &= ~IFF_MULTICAST;
	random_ether_addr(dev->dev_addr);
}
+
+static int
+micveth_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+}
+
/* Netlink link ops registering the "micveth" device type. */
static struct rtnl_link_ops micveth_link_ops __read_mostly = {
	.kind = "micveth",
	.setup = micveth_setup,
	.validate = micveth_validate,
};
+
+static int
+micveth_probe_int(micveth_info_t *veth_info, mic_ctx_t *mic_ctx)
+{
+ struct net_device *dev_veth;
+ int err = 0;
+
+ veth_info->vi_sbox = ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH);
+ veth_info->vi_scratch14 = (uint32_t *)(veth_info->vi_sbox + SBOX_SCRATCH14);
+ veth_info->vi_scratch15 = (uint32_t *)(veth_info->vi_sbox + SBOX_SCRATCH14);
+ writel(0x55, veth_info->vi_sbox + SBOX_DCR);
+
+ veth_info->mic_ctx = mic_ctx;
+ mic_ctx->bi_vethinfo = (void *)veth_info;
+
+ spin_lock_init(&veth_info->vi_txlock);
+ spin_lock_init(&veth_info->vi_rxlock);
+
+ if (mic_vnet_mode == VNET_MODE_POLL)
+ INIT_DELAYED_WORK(&veth_info->vi_poll, micveth_poll);
+
+ snprintf(veth_info->vi_wqname, sizeof(veth_info->vi_wqname),
+ "VNET INTR %d", 0);
+ veth_info->vi_wq = create_singlethread_workqueue(veth_info->vi_wqname);
+ INIT_WORK(&veth_info->vi_txws, micvnet_tx_dequeue_handler);
+
+ if (mic_vnet_mode == VNET_MODE_INTR) {
+ if ((err = request_irq(get_sbox_irq(VNET_SBOX_INT_IDX),
+ micvnet_host_intr_handler, IRQF_DISABLED,
+ "micveth intr", veth_info))) {
+ printk(KERN_ERR "%s: interrupt registration failed\n", __func__);
+ return err;
+ }
+ INIT_WORK(&veth_info->vi_bh, micvnet_intr_bh_handler);
+ }
+
+ // Set the current sk_buff allocation size
+ veth_info->vi_skb_mtu = MICVETH_MAX_PACKET_SIZE + 32;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0)
+ if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", micveth_setup)) == NULL) {
+#else
+ if ((dev_veth = alloc_netdev(sizeof(micveth_info_t), "mic%d", NET_NAME_UNKNOWN, micveth_setup)) == NULL) {
+#endif
+ return -ENOMEM;
+ }
+
+ veth_info->vi_netdev = dev_veth;
+ dev_veth->ml_priv = veth_info;
+ dev_veth->rtnl_link_ops = &micveth_link_ops;
+
+ if ((err = register_netdev(dev_veth)) < 0) {
+ printk("register netdev failed %d\n", err);
+ free_netdev(dev_veth);
+ return err;
+ }
+
+ veth_info->vi_state = VETH_STATE_INITIALIZED;
+
+ /* Inform host after completing initialization */
+ printk("%s: writing magic to SC14 and SC15\n", __FUNCTION__);
+ writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH14);
+ writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH15);
+
+ return 0;
+}
+
/* Per-board removal hook: just stops the interface. */
void
micveth_remove_int(mic_ctx_t *mic_ctx)
{
	micveth_stop(mic_ctx);
}
+
+static int __init
+micveth_create_int(int num_bds, struct device *dev)
+{
+ int bd;
+ int err = 0;
+
+ printk("micveth_init(%d)\n", num_bds);
+
+ micveth.lv_num_interfaces = num_bds;
+ micveth.lv_num_clients = num_bds;
+ micveth.lv_active_clients = 0;
+ micveth.lv_num_links_remaining = num_bds;
+
+ if ((err = rtnl_link_register(&micveth_link_ops))) {
+ printk(KERN_ERR "%s: rtnl_link_register failed!\n", __func__);
+ return err;
+ }
+
+ // Allocate space for the control of each device in the system.
+ micveth.lv_info = kmalloc(sizeof(micveth_info_t) * num_bds, GFP_KERNEL);
+ if (!micveth.lv_info) {
+ printk(KERN_ERR "%s: micveth_info alloc failed!\n", __func__);
+ return -ENOMEM;
+ }
+
+ // Initialize state mutex. Overloaded use for several fields.
+ mutex_init(&micveth.lv_state_mutex);
+
+ // Setup of timer for probeing active mic clients. When the total active board
+ // count is zero the poll is not running.
+ micveth.lv_pollstate = CLIENT_POLL_STOPPED;
+ INIT_DELAYED_WORK(&micveth.lv_poll, micveth_clientpoll);
+ init_waitqueue_head(&micveth.lv_wq);
+
+ // Init each of the existing boards.
+ for (bd = 0; bd < num_bds; bd++) {
+#ifdef HOST
+ micveth_probe_int(&micveth.lv_info[bd], &mic_data.dd_bi[bd]->bi_ctx);
+#else
+ micveth_probe_int(&micveth.lv_info[bd], &mic_ctx_g);
+#endif
+ }
+
+ return err;
+}
+
/*
 * Legacy intr/poll-mode driver teardown. Only board 0 is torn down here —
 * on the card side there is a single local board; the empty HOST blocks
 * suggest host-side teardown was stripped from this path (TODO confirm).
 */
static void
micveth_exit_int(void)
{
	micveth_info_t *veth_info = &micveth.lv_info[0];
#ifdef HOST
#endif
	micveth_stop(veth_info->mic_ctx);

	/* Flush and destroy the per-interface workqueue before unmapping the
	   MMIO regions its work items touch. */
	destroy_workqueue(veth_info->vi_wq);
	rtnl_link_unregister(&micveth_link_ops);

#ifdef HOST
#else // card
	iounmap((void *)veth_info->ring_ptr);
	iounmap(veth_info->vi_sbox);
#endif

	kfree(micveth.lv_info);
}
+
/* Card side - tell the host that the interface is up */
static int
micveth_start_int(mic_ctx_t *mic_ctx)
{
	micveth_info_t *veth_info = &micveth.lv_info[mic_ctx->bi_id];

	// Eventually most of the descriptor allocation for a board will be done here
	if (veth_info->vi_state != VETH_STATE_INITIALIZED)
		return 0;

	mutex_lock(&micveth.lv_state_mutex);

	/* Kick off the client-poll worker on the first active client. */
	if (micveth.lv_pollstate == CLIENT_POLL_STOPPED) {
		schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY));
		micveth.lv_pollstate = CLIENT_POLL_RUNNING;
	}

	micveth.lv_active_clients++;
	mutex_unlock(&micveth.lv_state_mutex);

	/* LINKDOWN means "started but link not yet negotiated"; the poll
	   worker promotes it to LINKUP once the handshake completes. */
	veth_info->vi_state = VETH_STATE_LINKDOWN;

	return 0;
}
+
/* Card side - tell the host that the interface is down */
static void
micveth_stop_int(mic_ctx_t *mic_ctx)
{
	micveth_info_t *veth_info = (micveth_info_t *)(mic_ctx->bi_vethinfo);

	/* INITIALIZED means never started (or already stopped) — nothing to do. */
	if (veth_info->vi_state == VETH_STATE_INITIALIZED)
		return;

	mutex_lock(&micveth.lv_state_mutex);
	micveth.lv_active_clients--;
	veth_info->vi_state = VETH_STATE_INITIALIZED;

	/* Other clients still active: keep the poll worker running. */
	if (micveth.lv_active_clients) {
		mutex_unlock(&micveth.lv_state_mutex);
		return;
	}

	micveth.lv_num_links_remaining = micveth.lv_num_clients;

#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP
	/* Poll already stopped itself once all links came up; just record it. */
	micveth.lv_pollstate = CLIENT_POLL_STOPPED;
	mutex_unlock(&micveth.lv_state_mutex);
#else
	/* Ask the poll worker to stop and wait until it acknowledges. */
	micveth.lv_pollstate = CLIENT_POLL_STOPPING;
	mutex_unlock(&micveth.lv_state_mutex);
	wait_event(micveth.lv_wq, micveth.lv_pollstate == CLIENT_POLL_STOPPED);
#endif

#ifdef HOST
#else // card
	/* Signal link-down to the host via the scratch-register handshake. */
	writel(MICVETH_LINK_DOWN_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH14);
	writel(MICVETH_LINK_DOWN_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH15);
#endif
}
+
+#ifdef HOST
+#else // card
/* Link detection */
/*
 * Delayed-work handler that drives the card-side link state machine via the
 * SBOX scratch registers 14/15:
 *  - the card writes LINK_UP_MAGIC to both when ready (see micveth_probe_int);
 *  - the host replaces both with the ring-buffer physical address halves;
 *  - seeing both registers changed, the card maps the ring and goes LINKUP.
 */
static void
micveth_clientpoll(struct work_struct *work)
{
	micveth_info_t *veth_info;
	mic_ctx_t *mic_ctx;	/* NOTE(review): assigned below but unused */
	uint32_t scratch14;
	uint32_t scratch15;
	struct net_device *dev_veth;
	veth_info = &micveth.lv_info[0];
	dev_veth = veth_info->vi_netdev;
	mic_ctx = veth_info->mic_ctx;
	mutex_lock(&micveth.lv_state_mutex);

	/* Acknowledge a stop request from micveth_stop_int() and exit. */
	if (micveth.lv_pollstate == CLIENT_POLL_STOPPING) {
		micveth.lv_pollstate = CLIENT_POLL_STOPPED;
		mutex_unlock(&micveth.lv_state_mutex);
		wake_up(&micveth.lv_wq);
		return;
	}

	if (veth_info->vi_state == VETH_STATE_LINKUP) {
		/* Link is up: watch for the host writing LINK_DOWN_MAGIC. */
		scratch14 = readl(veth_info->vi_sbox + SBOX_SCRATCH14);
		scratch15 = readl(veth_info->vi_sbox + SBOX_SCRATCH15);

		if ((MICVETH_LINK_DOWN_MAGIC == scratch14) &&
		    (MICVETH_LINK_DOWN_MAGIC == scratch15)) {
			veth_info->vi_state = VETH_STATE_LINKDOWN;
		}
	} else {
		scratch14 = readl(veth_info->vi_sbox + SBOX_SCRATCH14);
		scratch15 = readl(veth_info->vi_sbox + SBOX_SCRATCH15);

		/* Both registers no longer hold the magic: the host has
		   published the ring-buffer address in them. */
		if ((MICVETH_LINK_UP_MAGIC != scratch14) &&
		    (MICVETH_LINK_UP_MAGIC != scratch15)) {
			printk("micveth_clientpoll(): SC14 and SC15 changed from MAGIC, I got the RB addresses!\n");
			writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH14);
			writel(MICVETH_LINK_UP_MAGIC, veth_info->vi_sbox + SBOX_SCRATCH15);
			/* Low two MAC bytes come from the high half of SC15. */
			dev_veth->dev_addr[4] = (scratch15 >> 24) & 0xff;
			dev_veth->dev_addr[5] = (scratch15 >> 16) & 0xff;
			/* Ring phys addr: SC14 holds bits 31:0, SC15 bits 47:32;
			   bit 39 selects host memory through the card aperture
			   (presumably — TODO confirm against SBOX docs). */
			veth_info->vi_ring.phys = ((uint64_t)(scratch15 & 0xffff) << 32) | scratch14;
			veth_info->vi_ring.phys |= (1ULL << 39);
			veth_info->vi_ring.length = sizeof(veth_ring_t);
			veth_info->ring_ptr = ioremap_nocache(veth_info->vi_ring.phys, veth_info->vi_ring.length);
			BUG_ON(veth_info->ring_ptr == NULL);

			printk("micveth_clientpoll(): VETH_STATE_LINKUP\n");
			veth_info->vi_state = VETH_STATE_LINKUP;
			if (mic_vnet_mode == VNET_MODE_POLL) {
				printk("micveth_clientpoll(): poll for work now !!\n");
				schedule_delayed_work(&veth_info->vi_poll, msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY));
			}

			micveth.lv_num_links_remaining--;
		}
	}
	mutex_unlock(&micveth.lv_state_mutex);

	/* Re-arm; optionally stop polling once every link is up. */
#if PWR_MGMT_NO_POLL_AFTER_LINKS_UP
	if (micveth.lv_num_links_remaining)
#endif
		schedule_delayed_work(&micveth.lv_poll, msecs_to_jiffies(MICVETH_CLIENT_TIMER_DELAY));
}
+#endif
+extern struct sk_buff *jsp_dbg1;
+
+#ifdef HOST
+#else // card
/*
 * Hard-IRQ handler for the host->card vnet interrupt (legacy intr mode).
 * Defers all work to the bottom-half workqueue; nothing is done in IRQ context.
 */
static irqreturn_t
micvnet_host_intr_handler(int irq, void *cookie)
{
	micveth_info_t *veth_info = cookie;
	queue_work(veth_info->vi_wq, &veth_info->vi_bh);
	return IRQ_HANDLED;
}
+
/* Ring host doorbell 3 interrupt */
static void
micveth_send_intr(micveth_info_t *veth_info)
{
	uint32_t db_reg;

	// Read-modify-write SDBIC3 with the request bit set to ring
	// host doorbell 3.
	db_reg = readl(veth_info->vi_sbox + SBOX_SDBIC3) | SBOX_SDBIC0_DBREQ_BIT;
	writel(db_reg, veth_info->vi_sbox + SBOX_SDBIC3);
}
+
+static void
+_micveth_process_descriptors(micveth_info_t *veth_info)
+{
+ veth_ring_t *ring = veth_info->ring_ptr;
+ ring_queue_t *rx_queue = &ring->r_tx;
+ ring_desc_t desc;
+ struct sk_buff *skb;
+ void *pkt;
+ int receive_skb = 0;
+ int err;
+
+ if (veth_info->vi_state != VETH_STATE_LINKUP) {
+ return;
+ }
+
+ spin_lock(&veth_info->vi_rxlock);
+
+ while (rx_queue->rq_head != rx_queue->rq_tail) {
+ desc = rx_queue->rq_descs[rx_queue->rq_head];
+
+ veth_info->vi_netdev->stats.rx_packets++;
+ veth_info->vi_netdev->stats.rx_bytes += desc.rd_length;
+
+ pkt = ioremap_nocache(desc.rd_phys, desc.rd_length);
+ if (pkt == NULL) {
+ veth_info->vi_netdev->stats.rx_dropped++;
+ goto update_ring;
+ }
+
+ /* handle jumbo frame */
+ if (desc.rd_length > ETH_DATA_LEN)
+ skb = dev_alloc_skb(veth_info->vi_skb_mtu);
+ else
+ skb = dev_alloc_skb(ETH_DATA_LEN + 32);
+ if (skb == NULL) {
+ veth_info->vi_netdev->stats.rx_dropped++;
+ iounmap(pkt);
+ goto update_ring;
+ }
+
+ memcpy(skb_put(skb,desc.rd_length), pkt, desc.rd_length);
+ iounmap(pkt);
+ skb->dev = veth_info->vi_netdev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ skb->ip_summed = CHECKSUM_NONE;
+ local_bh_disable();
+ err = netif_receive_skb(skb);
+ err = err;
+ local_bh_enable();
+ /*
+ * Need a general memory barrier between copying the data from
+ * the buffer and updating the head pointer. It's the general
+ * mb() because we're ordering the read of the data with the write.
+ *
+ * No need for the serializing request (Si bug workaround in
+ * KNF), since the buffer exists in host memory. If the buffer
+ * lives in card memory, and this code is running on the host, we
+ * would need extra barriers and a "serializing request" on any write.
+ */
+ mb();
+update_ring:
+ rx_queue->rq_head = (rx_queue->rq_head + 1) % rx_queue->rq_length;
+ receive_skb++;
+ }
+
+ /* Send intr to TX so that pending SKB's can be freed */
+ if (receive_skb && mic_vnet_mode == VNET_MODE_INTR) {
+ micveth_send_intr(veth_info);
+ }
+
+ spin_unlock(&veth_info->vi_rxlock);
+
+ if (mic_vnet_mode == VNET_MODE_POLL) {
+ schedule_delayed_work(&veth_info->vi_poll, msecs_to_jiffies(MICVETH_POLL_TIMER_DELAY));
+ }
+}
+
/* Bottom-half work handler for the legacy intr mode: drain the RX ring. */
static void
micvnet_intr_bh_handler(struct work_struct *work)
{
	micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_bh);
	_micveth_process_descriptors(veth_info);
}
+
/* Delayed-work handler for poll mode; _micveth_process_descriptors()
 * re-schedules this work itself while the link stays up. */
static void
micveth_poll(struct work_struct *work)
{
	micveth_info_t *veth_info = container_of(work, micveth_info_t, vi_poll.work);

	_micveth_process_descriptors(veth_info);
}
+
+#endif
+
+#ifdef HOST
+#else // card
/*
 * Card-side module init: set up the single local board context, then run the
 * DMA-mode init (no-op in intr/poll modes) followed by the legacy init
 * (no-op in DMA mode) for one board.
 */
static int __init
micveth_module_init_int(void)
{
	mic_ctx_t *mic_ctx = &mic_ctx_g;
	int ret = 0;

	printk("micveth_probe()\n");
	memset(mic_ctx, 0, sizeof(*mic_ctx));
	mic_ctx->bi_id = 0;

	if ((ret = micveth_init(NULL)))
		return ret;
	if ((ret = micveth_init_legacy(1, NULL)))
		return ret;

	return 0;
}
+
/* Card-side module exit: micveth_exit() dispatches on mic_vnet_mode. */
static void __exit
micveth_module_exit_int(void)
{
	micveth_exit();
}
+#endif
+
+/*
+ VNET driver public API. These are simply wrappers which either invoke the old
+ interrupt/poll mode functions or the new DMA mode functions. These are temporary and
+ will be phased out with the old interrupt/poll mode so only the DMA mode will be around
+ eventually.
+ */
+int __init
+micveth_init(struct device *dev)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ return micvnet_init(dev);
+ /* Intr/poll modes use micveth_init_legacy */
+ return 0;
+}
+
+int __init
+micveth_init_legacy(int num_bds, struct device *dev)
+{
+ if (mic_vnet_mode != VNET_MODE_DMA)
+ return micveth_create_int(num_bds, dev);
+ /* DMA mode uses micveth_create */
+ return 0;
+}
+
+void
+micveth_exit(void)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ micvnet_exit();
+ else
+ micveth_exit_int();
+}
+
+int
+micveth_probe(mic_ctx_t *mic_ctx)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ return micvnet_probe(mic_ctx);
+ /* No support for micveth_probe in legacy intr/poll modes */
+ return 0;
+}
+
+void
+micveth_remove(mic_ctx_t *mic_ctx)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ micvnet_remove(mic_ctx);
+ /* No support for micveth_remove in legacy intr/poll modes */
+}
+
+int
+micveth_start(mic_ctx_t *mic_ctx)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ return micvnet_start(mic_ctx);
+ else
+ return micveth_start_int(mic_ctx);
+}
+
+void
+micveth_stop(mic_ctx_t *mic_ctx)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ micvnet_stop(mic_ctx);
+ else
+ micveth_stop_int(mic_ctx);
+}
+
+static int __init
+micveth_module_init(void)
+{
+ printk("vnet: mode: %s, buffers: %d\n",
+ mic_vnet_modes[mic_vnet_mode], vnet_num_buffers);
+
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ return micvnet_module_init();
+ else
+ return micveth_module_init_int();
+}
+
+static void __exit
+micveth_module_exit(void)
+{
+ if (mic_vnet_mode == VNET_MODE_DMA)
+ micvnet_module_exit();
+ else
+ micveth_module_exit_int();
+}
+
+#ifdef HOST
+#else // card
+module_init(micveth_module_init);
+module_exit(micveth_module_exit);
+
+MODULE_LICENSE("GPL");
+#endif
--- /dev/null
+
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/list.h>
+#include <linux/circ_buf.h>
+#include <linux/reboot.h>
+#include "mic_common.h"
+#include "mic/micveth_dma.h"
+#include "mic/mic_macaddr.h"
+
+/* TODO: Clean up shutdown, let DMA's drain */
+
+#ifndef HOST
+#define SBOX_SDBIC0_DBREQ_BIT 0x80000000
+#define SBOX_MMIO_LENGTH (64 * 1024)
+#endif
+#define STOP_WAIT_TIMEOUT (4 * HZ)
+
+#ifndef HOST
+static mic_ctx_t mic_ctx_g;
+#endif
+
+struct micvnet micvnet;
+
+
+static void micvnet_send_intr(struct micvnet_info *vnet_info);
+static int micvnet_init_msg_rings(struct micvnet_info *vnet_info);
+static int micvnet_init_rx_skb_send_msg(struct micvnet_info *vnet_info);
+static void micvnet_send_add_dma_buffer_messages(struct micvnet_info *vnet_info);
+static void micvnet_stop_ws(struct work_struct *work);
+static void micvnet_start_ws(struct work_struct *work);
+int get_sbox_irq(int index);
+
/* Map a vnet_info back to its owning MIC board context. */
static __always_inline mic_ctx_t *
vnet_to_ctx(struct micvnet_info *vnet_info)
{
	return vnet_info->mic_ctx;
}
+
/* Restart the netdev TX queue, but only while the link is up — waking the
 * queue during teardown would let the stack submit packets we can't send. */
static __always_inline void
micvnet_wake_queue(struct micvnet_info *vnet_info)
{
	if (atomic_read(&vnet_info->vi_state) == MICVNET_STATE_LINKUP)
		netif_wake_queue(vnet_info->vi_netdev);
}
+
/* Drop one pending-TX reference; when the count hits zero during link-down,
 * wake the thread in micvnet_stop() waiting for TX traffic to drain. */
static __always_inline void
micvnet_dec_cnt_tx_pending(struct micvnet_info *vnet_info)
{
	if (atomic_dec_and_test(&vnet_info->cnt_tx_pending) &&
	    (atomic_read(&vnet_info->vi_state) == MICVNET_STATE_LINK_DOWN))
		wake_up_interruptible(&vnet_info->stop_waitq);
}
+
+
+/***********************************************************
+ Pre-allocated "list" of objects which are allocated and deallocated in FIFO
+ sequence. Allows reservation of memory at init time to prevent mem allocation
+ failures at run time. */
+static int
+list_obj_list_init(int num_obj, size_t obj_size, struct obj_list *list)
+{
+ list->size = num_obj + 1;
+ list->obj_size = obj_size;
+ list->head = list->tail = 0;
+
+ if (!(list->buf = kmalloc(list->size * list->obj_size, GFP_KERNEL))) {
+ printk(KERN_ERR "%s: list alloc failed\n", __func__);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void
+list_obj_list_deinit(struct obj_list *list)
+{
+ if (list->buf) {
+ kfree(list->buf);
+ list->buf = NULL;
+ }
+}
+
/*
 * Take the next free object slot from the FIFO list, or NULL if the list is
 * exhausted. Single-producer: only the allocator side moves `head`.
 */
static void *
list_obj_alloc(struct obj_list *list)
{
	char *obj;

	/* Remove bug_on() here to handle VNET OOO messages. In OOO conditions
	 * requests to allocate more objects than list->size are possible. */
	if (((list->head + 1) % list->size) == list->tail) {
		printk(KERN_ERR "%s: BUG: no free objects in obj list\n", __func__);
		return NULL;
	}

	obj = list->buf + list->head * list->obj_size;
	/* Publish any writes to the slot before advancing head. */
	wmb();
	list->head = (list->head + 1) % list->size;

	return obj;
}
+
/* Return the oldest allocated object to the FIFO list (objects are always
 * freed in allocation order; only the consumer side moves `tail`). */
void
list_obj_free(struct obj_list *list)
{
	/* Remove bug_on() here to handle VNET OOO messages */
	if (list->tail == list->head) {
		printk(KERN_ERR "%s: BUG: free too many list objects\n", __func__);
		return;
	}

	list->tail = (list->tail + 1) % list->size;
}
+
+/***********************************************************
+ * Vnet message functions
+ */
+#ifdef HOST
/* Reset a message ring buffer to empty. prev_head/prev_tail start at
 * size - 1 so the "index == prev + 1 (mod size)" sanity checks in the
 * read/write paths hold on the very first message. */
static void
micvnet_msg_rb_init(struct micvnet_msg_rb *rb)
{
	rb->head = rb->tail = 0;
	rb->size = MICVNET_MSG_RB_SIZE;
	rb->prev_head = rb->prev_tail = rb->size - 1;
}
+
/* Host side: reinitialize both directions of the message queue pair. */
static void
micvnet_reset_msg_rings(struct micvnet_info *vnet_info)
{
	micvnet_msg_rb_init(vnet_info->vi_qp.tx);
	micvnet_msg_rb_init(vnet_info->vi_qp.rx);
}
+#endif
+
/*
 * Append one message to the TX ring shared with the peer. The barrier /
 * serializing_request sequence orders the payload write before the head
 * update so the reader never sees a published slot with stale contents.
 */
static void
micvnet_msg_rb_write_msg(struct micvnet_info *vnet_info, struct micvnet_msg *msg)
{
	struct micvnet_msg_rb *rb = vnet_info->vi_qp.tx;

	/* The condition below should never occur under normal conditions
	   because the VNET message ring buffer size is at least 1 greater than
	   the maximum total number of outstanding messages possible in the
	   system. However, all bets are off if VNET OOO messages are
	   seen. Therefore remove the previous bug_on() here and busy wait. */
	while (((rb->head + 1) % rb->size) == rb->tail)
		cpu_relax();

	if (!(rb->head == (rb->prev_head + 1) % rb->size))
		printk(KERN_ERR "BUG: head not equal to prev_head + 1:\n \
			head %d prev_head %d\n", rb->head, rb->prev_head);

	smp_mb();
#ifdef HOST
	/* Ring lives in host memory: a plain struct copy suffices. */
	rb->buf[rb->head] = *msg;
#else
	/* On the card the ring is mapped MMIO: use memcpy_toio. */
	memcpy_toio(&rb->buf[rb->head], msg, sizeof(*msg));
#endif
	smp_mb();
	serializing_request(&rb->buf[rb->head]);

	rb->prev_head = rb->head;
	rb->head = (rb->head + 1) % rb->size;
#ifndef HOST
	/* NOTE(review): self-assignment is a no-op as written — presumably a
	   leftover PCIe write-flush/readback workaround; confirm intent. */
	rb->head = rb->head;
#endif
	smp_mb();
	serializing_request(&rb->head);
}
+
/*
 * Pop one message from the RX ring shared with the peer.
 * Returns 0 if a message was read into *msg, 1 if the ring is empty.
 * The barriers order the payload read before the tail update, mirroring
 * the write side.
 */
static int
micvnet_msg_rb_read_msg(struct micvnet_info *vnet_info, struct micvnet_msg *msg)
{
	struct micvnet_msg_rb *rb = vnet_info->vi_qp.rx;

	if (rb->tail == rb->head)
		return 1;

	if (!(rb->tail == (rb->prev_tail + 1) % rb->size))
		printk(KERN_ERR "BUG: tail not equal to prev_tail + 1:\n \
			tail %d prev_tail %d\n", rb->tail, rb->prev_tail);

	smp_mb();
#ifdef HOST
	/* Ring lives in host memory: plain struct copy. */
	*msg = rb->buf[rb->tail];
#else
	/* On the card the ring is mapped MMIO: use memcpy_fromio. */
	memcpy_fromio(msg, &rb->buf[rb->tail], sizeof(*msg));
#endif
	smp_mb();
	serializing_request(&rb->buf[rb->tail]);

	rb->prev_tail = rb->tail;
	rb->tail = (rb->tail + 1) % rb->size;
#ifndef HOST
	/* NOTE(review): no-op self-assignment, mirrors the write side —
	   presumably a flush workaround; confirm intent. */
	rb->tail = rb->tail;
#endif
	smp_mb();
	serializing_request(&rb->tail);

	return 0;
}
+
/*
 * Enqueue a message to the peer and ring its interrupt. On the host the
 * interrupt is only sent while holding a PM reference; if the reference
 * cannot be taken the message stays queued (presumably picked up on the
 * next successful interrupt — TODO confirm).
 */
void
micvnet_msg_send_msg(struct micvnet_info *vnet_info, struct micvnet_msg *msg)
{
	micvnet_msg_rb_write_msg(vnet_info, msg);
#ifdef HOST
	if (micpm_get_reference(vnet_to_ctx(vnet_info), true))
		return;
#endif
	micvnet_send_intr(vnet_info);
#ifdef HOST
	micpm_put_reference(vnet_to_ctx(vnet_info));
#endif
}
+
+static void
+micvnet_msg_send_add_dma_buffer_msg(struct micvnet_info *vnet_info,
+ struct rx_node *rnode)
+{
+ struct micvnet_msg msg;
+ struct micvnet_msg_add_dma_buffer
+ *body = &msg.body.micvnet_msg_add_dma_buffer;
+
+ msg.msg_id = MICVNET_MSG_ADD_DMA_BUFFER;
+ body->buf_phys = rnode->phys;
+ body->buf_size = rnode->size;
+ micvnet_msg_send_msg(vnet_info, &msg);
+}
+
/*
 * Peer advertised a DMA buffer: record it on the vi_dma_buf list so the TX
 * path can DMA into it, then bump the available count and restart the queue.
 */
static void
micvnet_msg_recv_add_dma_buffer(struct micvnet_info *vnet_info,
				struct micvnet_msg_add_dma_buffer *msg)
{
	struct dma_node *dnode;

	/* Remove bug_on() here to handle VNET OOO messages */
	if (!(dnode = list_obj_alloc(&vnet_info->dnode_list)))
		return;

	dnode->phys = msg->buf_phys;
	dnode->size = msg->buf_size;

	/* vi_rxlock guards the dma-buffer list. */
	spin_lock(&vnet_info->vi_rxlock);
	list_add_tail(&dnode->list, &vnet_info->vi_dma_buf);
	spin_unlock(&vnet_info->vi_rxlock);

	atomic_inc(&vnet_info->cnt_dma_buf_avail);
	micvnet_wake_queue(vnet_info);
}
+
+static void
+micvnet_msg_send_dma_complete_msg(struct micvnet_info *vnet_info,
+ struct sched_node *snode)
+{
+ struct micvnet_msg msg;
+ struct micvnet_msg_dma_complete
+ *body = &msg.body.micvnet_msg_dma_complete;
+
+ msg.msg_id = MICVNET_MSG_DMA_COMPLETE;
+ body->dst_phys = snode->dst_phys;
+ body->size = snode->skb->len;
+ body->dma_offset = snode->dma_offset;
+ micvnet_msg_send_msg(vnet_info, &msg);
+}
+
/* Handle an unexpected out-of-order message */
/*
 * Validate a dma_complete message against the head of the rx_skb list.
 * Returns 0 when the caller should process the message normally (the head
 * rnode matches msg->dst_phys) and 1 when the message was consumed/dropped
 * here. OOO recovery rotates skipped rnodes to the back of the list and
 * re-advertises them to the peer rather than reallocating their skbs.
 */
static int
micvnet_msg_handle_ooo_msg(struct micvnet_info *vnet_info,
			   struct micvnet_msg_dma_complete *msg)
{
	struct micvnet_msg_rb *rb = vnet_info->vi_qp.rx;
	struct rx_node *rnode;
	struct list_head *pos, *tmpl;
	bool found = false;

	rnode = list_entry((&vnet_info->vi_rx_skb)->next, struct rx_node, list);

	/* Normal operation */
	if (rnode->phys == msg->dst_phys
	    && msg->size <= (rnode->size - 3 * DMA_ALIGNMENT)
	    && msg->dma_offset < 2 * DMA_ALIGNMENT)
		return 0;

	/* Flag that weird stuff's going on */
	printk(KERN_ERR "BUG: Unexpected vnet dma_complete message parameters:\n \
		rnode->phys %p, msg->dst_phys %p\n \
		rnode->size %lld, msg->size %lld, msg->dma_offset %lld\n \
		rx rb head %d tail %d size %d\n",
	       (char *) rnode->phys, (char *) msg->dst_phys,
	       rnode->size, msg->size, msg->dma_offset,
	       rb->head, rb->tail, rb->size);

	/* if message is received in order but with incorrect parameters
	   (size/dma_offset), drop it, but re-add the rnode at the back of the
	   rx_skb list, as well as at tx, similar to what is done below for ooo
	   case. */
	if (rnode->phys == msg->dst_phys) {
		list_del(&rnode->list);
		list_add_tail(&rnode->list, &vnet_info->vi_rx_skb);
		micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode);
		vnet_info->vi_netdev->stats.rx_dropped++;
		return 1;
	}

	/* Start of OOO message processing. First check if the message has
	 * really been received OOO. If it is completely unknown to us we just
	 * drop it and go on. */
	list_for_each(pos, &vnet_info->vi_rx_skb) {
		rnode = list_entry(pos, struct rx_node, list);
		if (rnode->phys == msg->dst_phys) {
			found = true;
			break;
		}
	}

	if (!found) {
		vnet_info->vi_netdev->stats.rx_dropped++;
		return 1;
	}

	vnet_info->vi_netdev->stats.rx_errors++;

	/* Skip all the rnode's till we find the one we are looking for. Rather
	 * than free rnode skb's and reallocate them, and thereby risk allocation
	 * failures, we simply delete the rnode's from their current position on
	 * the rnode list and re-add them at back of the list, as well as add
	 * them back at tx. */
	list_for_each_safe(pos, tmpl, &vnet_info->vi_rx_skb) {
		rnode = list_entry(pos, struct rx_node, list);
		if (rnode->phys == msg->dst_phys)
			break;

		list_del(&rnode->list);
		list_add_tail(&rnode->list, &vnet_info->vi_rx_skb);
		micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode);
	}

	return 0;
}
+
/*
 * Peer finished a DMA into one of our receive buffers: detach the matching
 * rnode, replenish the buffer pool if the link is still up, and deliver the
 * skb to the network stack.
 */
static void
micvnet_msg_recv_dma_complete(struct micvnet_info *vnet_info,
			      struct micvnet_msg_dma_complete *msg)
{
	struct rx_node *rnode;
	struct sk_buff *skb;

	vnet_info->vi_netdev->stats.rx_packets++;

	/* Returns non-zero when the message was dropped/consumed by the OOO
	   recovery path. */
	if (micvnet_msg_handle_ooo_msg(vnet_info, msg))
		return;

	rnode = list_entry((&vnet_info->vi_rx_skb)->next, struct rx_node, list);
	/* Our OOO message handling guarantees that rnode->phys == msg->dst_phys */

	vnet_info->vi_netdev->stats.rx_bytes += msg->size;
	list_del(&rnode->list);

	/* Take a pending-TX ref only while the link is up; checked under
	   vi_txlock so link-down can drain reliably. */
	spin_lock_bh(&vnet_info->vi_txlock);
	if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP) {
		spin_unlock_bh(&vnet_info->vi_txlock);
		goto skip_adding_new_buffers;
	}
	atomic_inc(&vnet_info->cnt_tx_pending);
	spin_unlock_bh(&vnet_info->vi_txlock);

	/* OOM handling: check if a new SKB can be allocated. If not, we will re-add the
	   old SKB to TX and not give it to the network stack, i.e. drop it */
	if (micvnet_init_rx_skb_send_msg(vnet_info)) {
		list_add_tail(&rnode->list, &vnet_info->vi_rx_skb);
		micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode);
		micvnet_dec_cnt_tx_pending(vnet_info);
		vnet_info->vi_netdev->stats.rx_dropped++;
		return;
	}
	micvnet_dec_cnt_tx_pending(vnet_info);

skip_adding_new_buffers:
	skb = rnode->skb;
	/* dma_offset accounts for the DMA-alignment padding at the front. */
	skb_reserve(skb, msg->dma_offset);
	skb_put(skb, msg->size);
	skb->dev = vnet_info->vi_netdev;
	skb->protocol = eth_type_trans(skb, skb->dev);
	skb->ip_summed = CHECKSUM_NONE;

	local_bh_disable();
	netif_receive_skb(skb);
	local_bh_enable();

#ifdef HOST
	mic_ctx_unmap_single(vnet_to_ctx(vnet_info), rnode->phys, rnode->size);
#endif
	kfree(rnode);
}
+
/* Work handler: send LINK_DOWN to the peer (run from vi_ws_link_down). */
static void
micvnet_msg_send_link_down_msg(struct work_struct *work)
{
	struct micvnet_info *vnet_info
		= container_of(work, struct micvnet_info, vi_ws_link_down);
	struct micvnet_msg msg;
	msg.msg_id = MICVNET_MSG_LINK_DOWN;
	micvnet_msg_send_msg(vnet_info, &msg);
}
+
/*
 * Peer acknowledged (or initiated) link-down. If we started the shutdown,
 * wake the waiter in the stop path; otherwise schedule our own stop work.
 */
static void
micvnet_msg_recv_msg_link_down(struct micvnet_info *vnet_info)
{
	atomic_set(&vnet_info->vi_state, MICVNET_STATE_BEGIN_UNINIT);

	if (vnet_info->link_down_initiator)
		wake_up_interruptible(&vnet_info->stop_waitq);
	else
		schedule_work(&vnet_info->vi_ws_stop);
}
+
+static void
+micvnet_msg_send_link_up_msg(struct micvnet_info *vnet_info)
+{
+ struct micvnet_msg msg;
+ struct micvnet_msg_link_up
+ *body = &msg.body.micvnet_msg_link_up;
+
+ msg.msg_id = MICVNET_MSG_LINK_UP;
+ body->vnet_driver_version = VNET_DRIVER_VERSION;
+ micvnet_msg_send_msg(vnet_info, &msg);
+}
+
/*
 * Peer announced link-up: verify the driver versions match, then continue
 * bring-up (host schedules its start work; card advertises its DMA buffers).
 */
static void
micvnet_msg_recv_msg_link_up(struct micvnet_info *vnet_info,
			     struct micvnet_msg_link_up *msg)
{
	if (msg->vnet_driver_version != VNET_DRIVER_VERSION) {
		printk(KERN_ERR "%s: Error: vnet driver version mismatch: "
		       "expected %d actual %lld\n"
		       "Ensure that host and card modules are "
		       "from the same build.\n",
		       __func__, VNET_DRIVER_VERSION,
		       msg->vnet_driver_version);
		return;
	}
#ifdef HOST
	schedule_work(&vnet_info->vi_ws_start);
#else
	micvnet_send_add_dma_buffer_messages(vnet_info);
#endif
}
+
/*
 * Drain the RX message ring, dispatching each message to its handler.
 * Runs from the interrupt bottom half; on the host a PM reference is held
 * for the duration so the device stays accessible.
 */
static void
micvnet_msg_process_messages(struct micvnet_info *vnet_info)
{
	struct micvnet_msg msg;

#ifdef HOST
	micpm_get_reference(vnet_to_ctx(vnet_info), true);
#endif
	while (!micvnet_msg_rb_read_msg(vnet_info, &msg)) {
		switch(msg.msg_id) {
		case MICVNET_MSG_ADD_DMA_BUFFER:
			micvnet_msg_recv_add_dma_buffer
				(vnet_info,
				 &msg.body.micvnet_msg_add_dma_buffer);
			break;

		case MICVNET_MSG_DMA_COMPLETE:
			micvnet_msg_recv_dma_complete
				(vnet_info,
				 &msg.body.micvnet_msg_dma_complete);
			break;

		case MICVNET_MSG_LINK_DOWN:
			micvnet_msg_recv_msg_link_down(vnet_info);
			break;

		case MICVNET_MSG_LINK_UP:
			micvnet_msg_recv_msg_link_up(vnet_info,
						     &msg.body.micvnet_msg_link_up);
			break;

		default:
			printk(KERN_ERR "BUG: unknown vnet msg id: %lld\n", msg.msg_id);
			break;
		}
	}
#ifdef HOST
	micpm_put_reference(vnet_to_ctx(vnet_info));
#endif
}
+
+/***********************************************************
+ * Interrupts
+ */
+#ifdef HOST
/* Host side: doorbell interrupt from the card — defer to the workqueue BH. */
static int
micvnet_host_doorbell_intr_handler(mic_ctx_t *mic_ctx, int doorbell)
{
	struct micvnet_info *vnet_info;
	vnet_info = mic_ctx->bi_vethinfo;

	queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_bh);
	return 0;
}
+#else
/* Card side: hard-IRQ from the host — defer all work to the workqueue BH. */
static irqreturn_t
micvnet_host_intr_handler(int irq, void *data)
{
	struct micvnet_info *vnet_info = data;
	queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_bh);
	return IRQ_HANDLED;
}
+#endif
+
/* Bottom-half work handler: process all pending peer messages. */
static void
micvnet_intr_bh_handler(struct work_struct *work)
{
	struct micvnet_info *vnet_info
		= container_of(work, struct micvnet_info, vi_ws_bh);

	micvnet_msg_process_messages(vnet_info);
}
+
+#ifdef HOST
/* Host side: interrupt the card through the board-context helper. */
static void
micvnet_send_intr(struct micvnet_info *vnet_info)
{
	mic_ctx_t *mic_ctx = vnet_info->mic_ctx;
	mic_send_vnet_intr(mic_ctx);
}
+#else
/* Ring host doorbell 3 interrupt */
static void
micvnet_send_intr(struct micvnet_info *vnet_info)
{
	uint32_t db_reg;

	/* Read-modify-write SDBIC3 with the request bit set to ring
	   host doorbell 3. */
	db_reg = readl(vnet_info->vi_sbox + SBOX_SDBIC3)
		| SBOX_SDBIC0_DBREQ_BIT;
	writel(db_reg, vnet_info->vi_sbox + SBOX_SDBIC3);
}
+#endif
+
+/***********************************************************
+ * Net device ops and rtnl link ops
+ */
+/*
+ Do nothing in ndo_open and ndo_stop. There are two reasons for this:
+ 1. Since host and card side drivers are driver pairs, if ifconfig up or
+ ifconfig down occurs on one side this needs to be communicated to the other
+ side other side otherwise in the current implementation this can bring down
+ the system. Ignoring ifconfig up or down avoids this issue.
+ 2. For now, micvnet_init is called before the dma can be initialized. However,
+ as soon as micvnet_init has been called and netdev has been created, the OS
+ can invoke .ndo_open, which however requires the DMA to have been
+ initialized. But DMA can not be initialized until later (at present after
+ the card has booted).
+ Therefore we ourselves call micvnet_start and micvnet_stop at appropriate
+ times when we are ready for them. The only consequence is all packets till
+ micvnet_start has been invoked will be dropped in ndo_start_xmit.
+ */
+
/* Start callback */
static int
micvnet_start_dev(struct net_device *dev)
{
	struct micvnet_info *vnet_info = dev->ml_priv;

	/* Stop the queue till the state becomes LINKUP. The queue will be started when
	   dma buffers are added in micvnet_msg_recv_add_dma_buffer(). Not doing this
	   results in packets getting dropped till state is LINKUP. */
	if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP)
		netif_stop_queue(vnet_info->vi_netdev);

	return 0;
}
+
/* Stop callback */
/* Intentionally a no-op: link teardown is driven by micvnet_stop(), not by
 * ifconfig-down — see the ndo_open/ndo_stop rationale comment above. */
static int
micvnet_stop_dev(struct net_device *dev)
{
	return 0;
}
+
/*
 * Work handler run after DMA completions: for each completed DMA (counted by
 * cnt_dma_complete) pop the oldest sched_node, tell the peer the transfer is
 * done, and release the skb and mapping. Completion order matches schedule
 * order, so popping the list head is correct.
 */
static void
micvnet_dma_cb_bh(struct work_struct *work)
{
	struct micvnet_info
		*vnet_info = container_of(work, struct micvnet_info, vi_ws_dmacb);
	struct sched_node *snode;

	/* Nothing completed since the last run (spurious queueing). */
	if (!atomic_read(&vnet_info->cnt_dma_complete))
		return;

	do {
		spin_lock_bh(&vnet_info->vi_txlock);
		snode = list_entry((&vnet_info->vi_sched_skb)->next,
				   struct sched_node, list);
		list_del(&snode->list);
		spin_unlock_bh(&vnet_info->vi_txlock);

		micvnet_msg_send_dma_complete_msg(vnet_info, snode);

		micvnet_dec_cnt_tx_pending(vnet_info);
#ifdef HOST
		mic_ctx_unmap_single(vnet_to_ctx(vnet_info),
				     snode->dma_src_phys, snode->dma_size);
		/* Drop the PM reference taken in micvnet_schedule_dma(). */
		micpm_put_reference(vnet_to_ctx(vnet_info));
#endif
		kfree_skb(snode->skb);
		kfree(snode);

	} while (!atomic_dec_and_test(&vnet_info->cnt_dma_complete));
}
+
/* DMA-engine completion callback: count the completion and defer the
 * heavyweight processing to micvnet_dma_cb_bh() on the workqueue. */
static void
micvnet_dma_completion_callback(uint64_t data)
{
	struct micvnet_info *vnet_info = (struct micvnet_info *) data;

	atomic_inc(&vnet_info->cnt_dma_complete);

	queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_dmacb);
}
+
/*
 * Program one DMA transfer for snode. The destination is rounded up to
 * DMA_ALIGNMENT; dma_offset records how far into the destination buffer the
 * real packet data starts (source alignment slack + destination rounding)
 * so the receiver can skb_reserve() past it.
 *
 * Returns 0 on success or a negative errno from the DMA layer.
 */
static int
micvnet_do_dma(struct micvnet_info *vnet_info, struct sched_node *snode)
{
	uint64_t dma_src, dma_dst;
	int ret = 0;

	dma_src = snode->dma_src_phys;
	dma_dst = ALIGN(snode->dst_phys, DMA_ALIGNMENT);
	snode->dma_offset = (snode->skb->data - snode->skb_data_aligned)
		+ (dma_dst - snode->dst_phys);
	if ((ret = request_dma_channel(vnet_info->dma_chan)))
		goto err_exit;

	/* DO_DMA_INTR: fire dma_cb (completion callback) when done. */
	ret = do_dma(vnet_info->dma_chan,
		     DO_DMA_INTR,
		     dma_src,
		     dma_dst,
		     snode->dma_size,
		     &vnet_info->dma_cb);

	free_dma_channel(vnet_info->dma_chan);

err_exit:
	return ret;
}
+
/*
 * Take the oldest queued TX skb (tnode), pair it with the oldest peer DMA
 * buffer (dnode), and schedule the DMA, tracking it with a sched_node
 * (snode). On any failure the skb is dropped and the TX accounting undone.
 * List updates are deferred until no further errors are possible.
 *
 * Returns 0 on success or a negative errno.
 */
static int
micvnet_schedule_dma(struct micvnet_info *vnet_info)
{
	struct tx_node *tnode;
	struct sched_node *snode;
	struct dma_node *dnode;
	struct sk_buff *skb;
	int ret = 0;
	/* tnode */
	spin_lock_bh(&vnet_info->vi_txlock);
	BUG_ON(list_empty(&vnet_info->vi_tx_skb));
	tnode = list_entry((&vnet_info->vi_tx_skb)->next,
			   struct tx_node, list);
	list_del(&tnode->list);
	spin_unlock_bh(&vnet_info->vi_txlock);
	skb = tnode->skb;
	kfree(tnode);

#ifdef HOST
	/* PM reference is held across the DMA; released in micvnet_dma_cb_bh()
	   on success or in err_exit below on failure. */
	if ((ret = micpm_get_reference(vnet_to_ctx(vnet_info), true)))
		goto err_exit_no_dec_node_refcnt;
#endif

	/* dnode */
	spin_lock(&vnet_info->vi_rxlock);
	BUG_ON(list_empty(&vnet_info->vi_dma_buf));
	dnode = list_entry((&vnet_info->vi_dma_buf)->next,
			   struct dma_node, list);
	spin_unlock(&vnet_info->vi_rxlock);
	/* 3 * DMA_ALIGNMENT headroom covers source + destination alignment
	   slack (see micvnet_do_dma()). */
	if (dnode->size < skb->len + 3 * DMA_ALIGNMENT) {
		ret = -ENOMEM;
		goto err_exit;
	}

	/* snode */
	if (!(snode = kmalloc(sizeof(*snode), GFP_KERNEL))) {
		ret = -ENOMEM;
		goto err_exit;
	}
	snode->skb = skb;
	snode->dst_phys = dnode->phys;
	/* Round the source start down to DMA_ALIGNMENT; the extra leading
	   bytes are skipped on the receive side via dma_offset. */
	snode->skb_data_aligned
		= (unsigned char *) ((uint64_t) skb->data & ~(DMA_ALIGNMENT - 1));
	snode->dma_size
		= ALIGN((skb->len + (skb->data - snode->skb_data_aligned)),
			DMA_ALIGNMENT);
#ifdef HOST
	snode->dma_src_phys = mic_ctx_map_single(vnet_to_ctx(vnet_info),
						 snode->skb_data_aligned,
						 snode->dma_size);
	if (mic_map_error(snode->dma_src_phys)) {
		kfree(snode);
		ret = -ENOMEM;
		goto err_exit;
	}
#else
	snode->dma_src_phys = virt_to_phys(snode->skb_data_aligned);
#endif

	if ((ret = micvnet_do_dma(vnet_info, snode))) {
#ifdef HOST
		mic_ctx_unmap_single(vnet_to_ctx(vnet_info),
				     snode->dma_src_phys, snode->dma_size);
#endif
		kfree(snode);
		goto err_exit;
	}

	/* Update snode/dnode lists only after all operations have successfully
	   completed and no further errors are possible */
	spin_lock_bh(&vnet_info->vi_txlock);
	list_add_tail(&snode->list, &vnet_info->vi_sched_skb);
	spin_unlock_bh(&vnet_info->vi_txlock);

	spin_lock(&vnet_info->vi_rxlock);
	list_del(&dnode->list);
	spin_unlock(&vnet_info->vi_rxlock);
	list_obj_free(&vnet_info->dnode_list);

	vnet_info->vi_netdev->stats.tx_packets++;
	vnet_info->vi_netdev->stats.tx_bytes += skb->len;

	return ret;

err_exit:
#ifdef HOST
	micpm_put_reference(vnet_to_ctx(vnet_info));
err_exit_no_dec_node_refcnt:
#endif
	/* Undo the accounting done by the xmit path and drop the packet. */
	micvnet_dec_cnt_tx_pending(vnet_info);
	atomic_inc(&vnet_info->cnt_dma_buf_avail);
	micvnet_wake_queue(vnet_info);
	skb->dev->stats.tx_dropped++;
	kfree_skb(skb);
	return ret;
}
+
+static void
+micvnet_schedule_dmas(struct work_struct *work)
+{
+ struct micvnet_info *vnet_info
+ = container_of(work, struct micvnet_info, vi_ws_tx);
+ volatile bool tx_skb_list_empty;
+ while (1) {
+ spin_lock_bh(&vnet_info->vi_txlock);
+ tx_skb_list_empty = list_empty(&vnet_info->vi_tx_skb);
+ spin_unlock_bh(&vnet_info->vi_txlock);
+ if (tx_skb_list_empty)
+ break;
+
+ micvnet_schedule_dma(vnet_info);
+ }
+}
+
/*
 * .ndo_start_xmit: queue an skb for asynchronous DMA transmission.
 *
 * One dma-buffer "credit" (cnt_dma_buf_avail) is consumed per packet;
 * when the last credit is taken the netif queue is stopped until a
 * credit is returned.  The actual copy happens in the vi_ws_tx work item
 * (micvnet_schedule_dmas).  Always returns NETDEV_TX_OK; on any failure
 * the skb is freed and tx_dropped incremented.
 */
int
micvnet_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct micvnet_info *vnet_info = (struct micvnet_info*)dev->ml_priv;
	struct tx_node *tnode;

	/* ml_priv is cleared by micvnet_remove(); zero credits == no room */
	if (!vnet_info || !atomic_read(&vnet_info->cnt_dma_buf_avail)){
		goto err_exit;
	}

	if (!(tnode = kmalloc(sizeof(*tnode), GFP_ATOMIC)))
		goto err_exit;
	tnode->skb = skb;

	/* NOTE(review): plain spin_lock here while other paths use
	   spin_lock_bh — presumably safe because ndo_start_xmit runs with
	   BHs disabled; confirm before touching the locking. */
	spin_lock(&vnet_info->vi_txlock);
	/* state transitions to LINK_DOWN happen under vi_txlock, so this
	   check cannot race micvnet_initiate_link_down() */
	if (atomic_read(&vnet_info->vi_state) != MICVNET_STATE_LINKUP)
		goto err_exit_unlock;
	list_add_tail(&tnode->list, &vnet_info->vi_tx_skb);
	atomic_inc(&vnet_info->cnt_tx_pending);
	spin_unlock(&vnet_info->vi_txlock);

	queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_tx);

	if (atomic_dec_and_test(&vnet_info->cnt_dma_buf_avail))
		netif_stop_queue(vnet_info->vi_netdev);

	return NETDEV_TX_OK;

err_exit_unlock:
	kfree(tnode);
	spin_unlock(&vnet_info->vi_txlock);
err_exit:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}
+
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0)
/* No-op .ndo_set_multicast_list stub: multicast is disabled
   (IFF_MULTICAST is cleared in micvnet_setup()), but kernels before 3.2
   still expect the callback slot to be populated. */
static void
micvnet_multicast_list(struct net_device *dev)
{
}
#endif
+
+static int
+micvnet_set_address(struct net_device *dev, void *p)
+{
+ struct sockaddr *sa = p;
+
+ if (!is_valid_ether_addr(sa->sa_data))
+ return -EADDRNOTAVAIL;
+
+ memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
+ return 0;
+}
+
+#define MIN_MTU 68
+#define MAX_MTU MICVNET_MAX_MTU
+
+static int
+micvnet_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (new_mtu < MIN_MTU || new_mtu > MAX_MTU)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
/* Overlay of the three 32-bit DBOX scratch registers that carry the card
   serial number: 12 ASCII bytes viewed as a string, with one extra byte
   reserved for the terminating NUL. */
union serial {
	uint32_t regs[3];
	char string[13];
};
+
/*
 * Read the 12-character card serial number out of the DBOX SWF1X0..X2
 * scratch registers into @serialnum (caller must supply >= 13 bytes).
 * Host reads go through the mapped MMIO context; the card reads its own
 * ioremapped DBOX window.
 */
void
mic_get_serial_from_dbox(struct micvnet_info *vni, char *serialnum)
{
	union serial serial;
#ifdef HOST
	serial.regs[0] = DBOX_READ(vni->mic_ctx->mmio.va, DBOX_SWF1X0);
	serial.regs[1] = DBOX_READ(vni->mic_ctx->mmio.va, DBOX_SWF1X1);
	serial.regs[2] = DBOX_READ(vni->mic_ctx->mmio.va, DBOX_SWF1X2);
#else
	serial.regs[0] = readl(vni->vi_dbox + DBOX_SWF1X0);
	serial.regs[1] = readl(vni->vi_dbox + DBOX_SWF1X1);
	serial.regs[2] = readl(vni->vi_dbox + DBOX_SWF1X2);
#endif
	/* registers are raw bytes; force termination before the copy */
	serial.string[12] = '\0';
	strcpy(serialnum, serial.string);
}
+
+int
+micvnet_setmac_from_serial(struct net_device *dev)
+{
+ struct micvnet_info *vni = (struct micvnet_info *)dev->ml_priv;
+ char serialnum[17];
+ int err;
+
+ mic_get_serial_from_dbox(vni, serialnum);
+#ifdef HOST
+ err = mic_get_mac_from_serial(serialnum, dev->dev_addr, 1);
+#else
+ err = mic_get_mac_from_serial(serialnum, dev->dev_addr, 0);
+#endif
+ return err;
+}
+
/* net_device callbacks for the mic%d interface.  No .ndo_get_stats:
   the default dev->stats accounting is used. */
static const struct net_device_ops micvnet_netdev_ops = {
	.ndo_open = micvnet_start_dev,
	.ndo_stop = micvnet_stop_dev,
	.ndo_start_xmit = micvnet_xmit,
	.ndo_validate_addr = eth_validate_addr,
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0)
	/* stub required only on pre-3.2 kernels */
	.ndo_set_multicast_list = micvnet_multicast_list,
#endif
	.ndo_set_mac_address = micvnet_set_address,
	.ndo_change_mtu = micvnet_change_mtu,
};
+
/*
 * netdev setup callback (used by both alloc_netdev and the rtnl link
 * ops): ethernet defaults, then driver-specific overrides.
 */
static void
micvnet_setup(struct net_device *dev)
{
	ether_setup(dev);

	/* Initialize the device structure. */
	dev->netdev_ops = &micvnet_netdev_ops;
	dev->destructor = free_netdev;

	/* Fill in device structure with ethernet-generic values. */
	dev->mtu = MICVNET_MAX_MTU;
	/* multicast is not supported over the DMA link */
	dev->flags &= ~IFF_MULTICAST;
}
+
/* rtnl link registration so the interface type shows up as "micvnet"
   (e.g. to `ip link`); reuses the same setup routine as alloc_netdev. */
static struct rtnl_link_ops micvnet_link_ops __read_mostly = {
	.kind = "micvnet",
	.setup = micvnet_setup,
};
+
+/***********************************************************
+ * Vnet init/deinit
+ */
+static int
+micvnet_init_hw_regs(struct micvnet_info *vnet_info)
+{
+#ifdef HOST
+ mic_ctx_t *mic_ctx = vnet_info->mic_ctx;
+
+ vnet_info->vi_pdev = mic_ctx->bi_pdev;
+ vnet_info->vi_sbox = (uint8_t *)((unsigned long) mic_ctx->mmio.va +
+ HOST_SBOX_BASE_ADDRESS);
+ vnet_info->vi_scratch14
+ = (uint32_t *)((unsigned long)mic_ctx->mmio.va +
+ HOST_SBOX_BASE_ADDRESS + SBOX_SCRATCH14);
+#else
+ vnet_info->vi_sbox = ioremap_nocache(SBOX_BASE, SBOX_MMIO_LENGTH);
+ vnet_info->vi_dbox = ioremap_nocache(DBOX_BASE, SBOX_MMIO_LENGTH);
+ if (!vnet_info->vi_sbox) {
+ printk(KERN_ERR "%s: NULL SBOX ptr\n", __func__);
+ return -ENOMEM;
+ }
+ vnet_info->vi_scratch14
+ = (uint32_t *)(vnet_info->vi_sbox + SBOX_SCRATCH14);
+#endif
+ return 0;
+}
+
/*
 * Undo micvnet_init_hw_regs().  Only the card side created mappings;
 * host-side pointers alias mic_ctx->mmio and need no teardown.
 */
static void
micvnet_deinit_hw_regs(struct micvnet_info *vnet_info)
{
#ifndef HOST
	iounmap(vnet_info->vi_sbox);
	iounmap(vnet_info->vi_dbox);
#endif
}
+
/*
 * Set up locks, the single-threaded vnet workqueue, all work items, and
 * the doorbell interrupt (host: doorbell 3 handler; card: SBOX IRQ).
 * A single-threaded workqueue is what serializes message sends and DMA
 * scheduling.  Returns 0, -ENOMEM, or the IRQ-registration error.
 */
static int
micvnet_init_interrupts(struct micvnet_info *vnet_info)
{
	mic_ctx_t *mic_ctx = vnet_info->mic_ctx;
	int ret = 0;

	spin_lock_init(&vnet_info->vi_txlock);
	spin_lock_init(&vnet_info->vi_rxlock);

	snprintf(vnet_info->vi_wqname, sizeof(vnet_info->vi_wqname),
		 "VNET WQ %d", mic_ctx->bi_id);

	if (!(vnet_info->vi_wq =
	      __mic_create_singlethread_workqueue(vnet_info->vi_wqname))) {
		printk(KERN_ERR "%s: create_singlethread_workqueue\n", __func__);
		return -ENOMEM;
	}
	init_waitqueue_head(&vnet_info->stop_waitq);

	INIT_WORK(&vnet_info->vi_ws_bh, micvnet_intr_bh_handler);
	INIT_WORK(&vnet_info->vi_ws_tx, micvnet_schedule_dmas);
	INIT_WORK(&vnet_info->vi_ws_dmacb, micvnet_dma_cb_bh);
	INIT_WORK(&vnet_info->vi_ws_link_down, micvnet_msg_send_link_down_msg);
	INIT_WORK(&vnet_info->vi_ws_stop, micvnet_stop_ws);
	INIT_WORK(&vnet_info->vi_ws_start, micvnet_start_ws);
	/* the #ifdef deliberately spans the if-condition: one error check
	   covers whichever registration call was compiled in */
#ifdef HOST
	if ((ret = mic_reg_irqhandler(mic_ctx, 3, "Host DoorBell 3",
				      micvnet_host_doorbell_intr_handler))) {
#else
	if ((ret = request_irq(get_sbox_irq(VNET_SBOX_INT_IDX),
			       micvnet_host_intr_handler, IRQF_DISABLED,
			       "vnet intr", vnet_info))) {
#endif
		printk(KERN_ERR "%s: interrupt registration failed\n", __func__);
		goto err_exit_destroy_workqueue;
	}
	return 0;

err_exit_destroy_workqueue:
	destroy_workqueue(vnet_info->vi_wq);
	return ret;
}
+
/*
 * Mirror of micvnet_init_interrupts(): release the doorbell/IRQ and
 * destroy the vnet workqueue.
 */
static void
micvnet_deinit_interrupts(struct micvnet_info *vnet_info)
{
#ifdef HOST
	mic_unreg_irqhandler(vnet_info->mic_ctx, 3, "Host DoorBell 3");
#else
	free_irq(get_sbox_irq(VNET_SBOX_INT_IDX), vnet_info);
#endif
	destroy_workqueue(vnet_info->vi_wq);
}
+
+
/*
 * Allocate, name ("mic%d"), address, and register the net_device.
 * The MAC is derived from the card serial number, falling back to a
 * random address if that fails.  Returns 0, -ENOMEM, or the
 * register_netdev() error (netdev is freed on that path).
 *
 * NOTE(review): alloc_netdev reserves sizeof(struct micvnet_info) of
 * private space, yet ml_priv is pointed at the externally-allocated
 * vnet_info — the built-in priv area appears unused; confirm.
 */
static int
micvnet_init_netdev(struct micvnet_info *vnet_info)
{
	struct net_device *dev_vnet;
	int ret = 0;


#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0))
	/* 3.17 added the name_assign_type argument */
	if ((dev_vnet = (struct net_device *)alloc_netdev(sizeof(struct micvnet_info), "mic%d",
							  NET_NAME_UNKNOWN, micvnet_setup)) == NULL) {
#else
	if ((dev_vnet = (struct net_device *)alloc_netdev(sizeof(struct micvnet_info), "mic%d",
							  micvnet_setup)) == NULL) {
#endif
		printk(KERN_ERR "%s: alloc_netdev failed\n", __func__);
		return -ENOMEM;
	}

	vnet_info->vi_netdev = dev_vnet;
	dev_vnet->ml_priv = vnet_info;

	if (micvnet_setmac_from_serial(dev_vnet))
		random_ether_addr(dev_vnet->dev_addr);

	dev_vnet->rtnl_link_ops = &micvnet_link_ops;

	if ((ret = register_netdev(dev_vnet)) < 0) {
		printk(KERN_ERR "%s: register_netdev failed %d\n", __func__, ret);
		free_netdev(dev_vnet);
		return ret;
	}

	return 0;
}
+
/*
 * Attach the host/card shared message-ring pair.
 *
 * Host: the rings live in host memory (vi_rp); reset them, then DMA-map
 * the structure so the card can reach it through vi_rp_phys.
 * Card: the vnet_addr module parameter carries the host ring physical
 * address; ioremap it and cross-wire the pointers (our tx is the host's
 * rx ring and vice versa).
 *
 * Returns 0 on success, -ENOMEM on map failure or missing vnet_addr.
 */
static int
micvnet_init_msg_rings(struct micvnet_info *vnet_info)
{
#ifdef HOST
	vnet_info->vi_qp.tx = &vnet_info->vi_rp.rb_tx;
	vnet_info->vi_qp.rx = &vnet_info->vi_rp.rb_rx;
	micvnet_reset_msg_rings(vnet_info);

	vnet_info->vi_rp_phys = mic_ctx_map_single(vnet_to_ctx(vnet_info),
						   &vnet_info->vi_rp,
						   sizeof(vnet_info->vi_rp));
	if (mic_map_error(vnet_info->vi_rp_phys)) {
		printk(KERN_ERR "%s: mic_map_error failed\n", __func__);
		return -ENOMEM;
	}
#else
	if (!(vnet_info->vi_rp_phys = vnet_addr)) {
		printk(KERN_ERR "%s: null vnet_addr\n", __func__);
		return -ENOMEM;
	}
	vnet_info->ring_ptr
		= ioremap_nocache(vnet_info->vi_rp_phys,
				  sizeof(struct micvnet_msg_ring_pair));
	if (!vnet_info->ring_ptr) {
		printk(KERN_ERR "%s: NULL ring ptr\n", __func__);
		return -ENOMEM;
	}
	/* deliberately swapped: card tx feeds host rx and vice versa */
	vnet_info->vi_qp.tx = &vnet_info->ring_ptr->rb_rx;
	vnet_info->vi_qp.rx = &vnet_info->ring_ptr->rb_tx;
#endif
	return 0;
}
+
/*
 * Mirror of micvnet_init_msg_rings(): unmap the host DMA mapping or
 * release the card's ioremapped window.
 */
static void
micvnet_deinit_msg_rings(struct micvnet_info *vnet_info)
{
#ifdef HOST
	mic_ctx_unmap_single(vnet_to_ctx(vnet_info),
			     vnet_info->vi_rp_phys, sizeof(vnet_info->vi_rp));
#else
	iounmap(vnet_info->ring_ptr);
#endif
}
+
+static int
+micvnet_init_lists(struct micvnet_info *vnet_info)
+{
+ int ret;
+ if ((ret = list_obj_list_init(VNET_MAX_SKBS, sizeof(struct dma_node),
+ &vnet_info->dnode_list)))
+ return ret;
+
+ INIT_LIST_HEAD(&vnet_info->vi_rx_skb);
+ INIT_LIST_HEAD(&vnet_info->vi_dma_buf);
+ INIT_LIST_HEAD(&vnet_info->vi_tx_skb);
+ INIT_LIST_HEAD(&vnet_info->vi_sched_skb);
+ return 0;
+}
+
/*
 * Free every node still parked on the rx/dma/tx/sched lists, undoing
 * any host-side DMA mappings and PM references they hold.  Called only
 * after the workqueue has been flushed and the state forced to
 * UNINITIALIZED, so there are no concurrent list users and no locking
 * is taken here.
 */
static void
micvnet_deinit_lists(struct micvnet_info *vnet_info)
{
	struct list_head *pos, *tmpq;
	struct rx_node *rnode;
	struct tx_node *tnode;
	struct dma_node *dnode;
	struct sched_node *snode;

	/* receive buffers offered to the peer */
	list_for_each_safe(pos, tmpq, &vnet_info->vi_rx_skb) {
		rnode = list_entry(pos, struct rx_node, list);
		list_del(&rnode->list);
#ifdef HOST
		mic_ctx_unmap_single(vnet_to_ctx(vnet_info),
				     rnode->phys, rnode->size);
#endif
		kfree_skb(rnode->skb);
		kfree(rnode);
	}

	/* destination buffers advertised by the peer (pool-allocated) */
	list_for_each_safe(pos, tmpq, &vnet_info->vi_dma_buf) {
		dnode = list_entry(pos, struct dma_node, list);
		list_del(&dnode->list);
		list_obj_free(&vnet_info->dnode_list);
	}

	/* queued-but-unscheduled tx packets */
	list_for_each_safe(pos, tmpq, &vnet_info->vi_tx_skb) {
		tnode = list_entry(pos, struct tx_node, list);
		list_del(&tnode->list);
		kfree_skb(tnode->skb);
		kfree(tnode);
	}

	/* packets whose DMA was submitted but never completed */
	list_for_each_safe(pos, tmpq, &vnet_info->vi_sched_skb) {
		snode = list_entry(pos, struct sched_node, list);
		list_del(&snode->list);
#ifdef HOST
		mic_ctx_unmap_single(vnet_to_ctx(vnet_info), snode->dma_src_phys,
				     snode->dma_size);
		/* drop the PM reference taken in micvnet_schedule_dma() */
		micpm_put_reference(vnet_to_ctx(vnet_info));
#endif
		kfree_skb(snode->skb);
		kfree(snode);
	}

	list_obj_list_deinit(&vnet_info->dnode_list);
}
/*
 * Open the DMA device, verify a channel can be allocated, and set up the
 * completion callback plus the tx bookkeeping counters.
 * cnt_dma_buf_avail starts at 0: credits are added as the peer posts
 * receive buffers.  The test-allocated channel is released immediately;
 * micvnet_do_dma() re-acquires it per transfer.
 */
static int
micvnet_init_dma(struct micvnet_info *vnet_info)
{
	mic_ctx_t *mic_ctx = vnet_info->mic_ctx;
	int ret;

	/* Note: open_dma_device must use mic_ctx->dma_handle since that is
	   used in the isr */
#ifdef HOST
	/* hold a PM reference across the open so the card stays awake */
	if (micpm_get_reference(mic_ctx, true) != 0) {
		printk(KERN_ERR "%s: micpm_get_reference failed\n", __func__);
		return -ENODEV;
	}

	if ((ret = open_dma_device(mic_ctx->bi_id + 1,
				   mic_ctx->mmio.va + HOST_SBOX_BASE_ADDRESS,
				   &mic_ctx->dma_handle))) {
		printk(KERN_ERR "%s: open_dma_device failed\n", __func__);
		micpm_put_reference(mic_ctx);
		return ret;
	}
	micpm_put_reference(mic_ctx);
#else
	if ((ret = open_dma_device(0, 0, &mic_ctx->dma_handle))) {
		printk(KERN_ERR "%s: open_dma_device failed\n", __func__);
		return ret;
	}
#endif

	vnet_info->dma_handle = mic_ctx->dma_handle;

	if ((ret = allocate_dma_channel(vnet_info->dma_handle,
					&vnet_info->dma_chan))) {
		printk(KERN_ERR "%s: allocate_dma_channel failed\n", __func__);
		goto err_exit_close_dma;
	}
	free_dma_channel(vnet_info->dma_chan);
	vnet_info->dma_cb.dma_completion_func = micvnet_dma_completion_callback;
	vnet_info->dma_cb.cb_cookie = (uint64_t) vnet_info;
	atomic_set(&vnet_info->cnt_dma_complete, 0);
	atomic_set(&vnet_info->cnt_dma_buf_avail, 0);
	vnet_info->link_down_initiator = false;
	atomic_set(&vnet_info->cnt_tx_pending, 0);
	return 0;

err_exit_close_dma:
	close_dma_device(mic_ctx->bi_id + 1, &vnet_info->dma_handle);
	return ret;
}
+
/* Mirror of micvnet_init_dma(): close the per-board DMA device. */
static void
micvnet_deinit_dma(struct micvnet_info *vnet_info)
{
	mic_ctx_t *mic_ctx = vnet_info->mic_ctx;

	close_dma_device(mic_ctx->bi_id + 1, &vnet_info->dma_handle);
}
+static int
+micvnet_alloc_rx_node(struct micvnet_info *vnet_info, struct rx_node **node)
+{
+ struct rx_node *rnode;
+
+ if (!(rnode = kmalloc(sizeof(*rnode), GFP_KERNEL)))
+ return -ENOMEM;
+
+ rnode->size = vnet_info->vi_netdev->mtu + 3 * DMA_ALIGNMENT + ETH_HLEN;
+
+ if (!(rnode->skb = dev_alloc_skb(rnode->size))) {
+ kfree(rnode);
+ return -ENOMEM;
+ }
+
+#ifdef HOST
+ rnode->phys = mic_ctx_map_single(vnet_to_ctx(vnet_info),
+ rnode->skb->data, rnode->size);
+ if (mic_map_error(rnode->phys)) {
+ kfree_skb(rnode->skb);
+ kfree(rnode);
+ return -ENOMEM;
+ }
+#else
+ rnode->phys = virt_to_phys(rnode->skb->data);
+#endif
+
+ *node = rnode;
+
+ return 0;
+}
+
+static int
+micvnet_init_rx_skb_send_msg(struct micvnet_info *vnet_info)
+{
+ struct rx_node *rnode;
+ int ret = 0;
+
+ if (unlikely(ret = micvnet_alloc_rx_node(vnet_info, &rnode)))
+ return ret;
+
+ list_add_tail(&rnode->list, &vnet_info->vi_rx_skb);
+
+ micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode);
+
+ return 0;
+}
+
/*
 * Pre-allocate the receive skbs that will later be offered to the peer
 * as DMA targets.  The vnet_num_buffers module parameter is clamped to
 * (0, VNET_MAX_SKBS].  On mid-loop failure, nodes already allocated
 * remain on vi_rx_skb and are reclaimed by micvnet_deinit_lists().
 */
static int
micvnet_init_rx_skbs(struct micvnet_info *vnet_info)
{
	struct rx_node *rnode;
	int i, ret = 0;


	if ( (vnet_num_buffers <= 0) || (vnet_num_buffers > VNET_MAX_SKBS) )
		vnet_num_buffers = VNET_MAX_SKBS;

	for (i = 0; i < vnet_num_buffers; i++) {
		if (unlikely(ret = micvnet_alloc_rx_node(vnet_info, &rnode)))
			return ret;

		list_add_tail(&rnode->list, &vnet_info->vi_rx_skb);
	}

	return ret;
}
+
+static void
+micvnet_send_add_dma_buffer_messages(struct micvnet_info *vnet_info)
+{
+ struct rx_node *rnode;
+ struct list_head *pos;
+
+ list_for_each(pos, &vnet_info->vi_rx_skb) {
+ rnode = list_entry(pos, struct rx_node, list);
+ micvnet_msg_send_add_dma_buffer_msg(vnet_info, rnode);
+ }
+}
+
/*
 * Begin an orderly link shutdown: stop the tx queue, flip the state to
 * LINK_DOWN under vi_txlock (so micvnet_xmit cannot race new packets
 * in), wait for in-flight tx DMAs to drain, then queue the LINK_DOWN
 * message to be sent from the wq thread.
 */
static void
micvnet_initiate_link_down(struct micvnet_info *vnet_info)
{
	int ret;

	netif_tx_disable(vnet_info->vi_netdev);
	spin_lock_bh(&vnet_info->vi_txlock);
	atomic_set(&vnet_info->vi_state, MICVNET_STATE_LINK_DOWN);
	spin_unlock_bh(&vnet_info->vi_txlock);

	/* This wait precludes this function to be called from the context of
	 * the vnet wq thread */
	ret = wait_event_interruptible_timeout(
		vnet_info->stop_waitq,
		(atomic_read(&vnet_info->cnt_tx_pending) == 0),
		STOP_WAIT_TIMEOUT);
	if (!ret)
		printk(KERN_ERR "%s timeout waiting for Tx dma buffers to drain\n", __func__);
	/* To avoid introducing a lock in micvnet_msg_send_msg() send the
	 * LINK_DOWN message from vnet wq thread context. LINK_DOWN will be the
	 * LAST message sent. */
	queue_work(vnet_info->vi_wq, &vnet_info->vi_ws_link_down);
}
+
/*
 * Final teardown after link-down: flush any remaining work, mark the
 * state UNINITIALIZED (so list teardown needs no locking), release DMA
 * and list resources, and drop this client from the active count.
 */
static void
micvnet_stop_deinit(struct micvnet_info *vnet_info)
{
	flush_workqueue(vnet_info->vi_wq);
	atomic_set(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED);

	micvnet_deinit_dma(vnet_info);
	micvnet_deinit_lists(vnet_info);
#ifdef HOST
	micvnet_reset_msg_rings(vnet_info);
#endif
	atomic_dec(&micvnet.lv_active_clients);
}
+
/*
 * Per-board probe: allocate the micvnet_info and bring up the static
 * parts (MMIO pointers, message rings, interrupts/workqueue, netdev).
 * The dynamic parts (lists, DMA, rx buffers) are brought up later by
 * micvnet_execute_start().
 *
 * Returns 0 on success, a negative errno on failure, or 1 (positive)
 * when the vnet layer was never initialized (micvnet.created == 0).
 * On failure the init steps are unwound in reverse order.
 */
int
micvnet_probe(mic_ctx_t *mic_ctx)
{
	struct micvnet_info *vnet_info;
	int ret = 0;

	mic_ctx->bi_vethinfo = NULL;

	if (!micvnet.created)
		return 1;

	if (!(vnet_info = kzalloc(sizeof(struct micvnet_info), GFP_KERNEL))) {
		printk(KERN_ERR "%s: vnet_info alloc failed\n", __func__);
		return -ENOMEM;
	}

	mic_ctx->bi_vethinfo = vnet_info;
	vnet_info->mic_ctx = mic_ctx;
	if ((ret = micvnet_init_hw_regs(vnet_info)))
		goto err_exit_free_vnet_info;
	if ((ret = micvnet_init_msg_rings(vnet_info)))
		goto err_exit_deinit_hw_regs;
	if ((ret = micvnet_init_interrupts(vnet_info)))
		goto err_exit_deinit_msg_rings;
	if ((ret = micvnet_init_netdev(vnet_info)))
		goto err_exit_deinit_interrupts;

	atomic_set(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED);
	return 0;

err_exit_deinit_interrupts:
	micvnet_deinit_interrupts(vnet_info);
err_exit_deinit_msg_rings:
	micvnet_deinit_msg_rings(vnet_info);
err_exit_deinit_hw_regs:
	micvnet_deinit_hw_regs(vnet_info);
err_exit_free_vnet_info:
	/* NOTE(review): bi_vethinfo still points at the freed vnet_info
	   here; callers treat a failed probe as fatal, but clearing it
	   would be safer — confirm. */
	kfree(vnet_info);

	return ret;
}
+
/*
 * Per-board removal: stop the link, detach the netdev's back-pointer so
 * in-flight micvnet_xmit() calls bail out, then unwind everything that
 * micvnet_probe() set up.  The netdev itself is released via its
 * destructor (free_netdev) when unregistered elsewhere.
 */
void
micvnet_remove(mic_ctx_t *mic_ctx)
{
	struct micvnet_info
		*vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo;

	if (!vnet_info)
		return;

	micvnet_stop(mic_ctx);

	vnet_info->vi_netdev->ml_priv = NULL;

	micvnet_deinit_interrupts(vnet_info);
	micvnet_deinit_msg_rings(vnet_info);
	micvnet_deinit_hw_regs(vnet_info);

	mic_ctx->bi_vethinfo = NULL;

	kfree(vnet_info);
}
+
/*
 * Bring the link up: transition UNINITIALIZED -> TRANSITIONING ->
 * LINKUP, setting up lists, the DMA device, and the receive buffers
 * along the way, then announce LINK_UP to the peer.  The cmpxchg guards
 * against concurrent/duplicate starts.
 *
 * Returns 0 on success, 1 on bad state/NULL info, or a negative errno
 * (with full unwind back to UNINITIALIZED) on setup failure.
 */
int
micvnet_execute_start(struct micvnet_info *vnet_info)
{
	int ret = 0;

	if (!vnet_info) {
		printk(KERN_ERR "%s: vnet_info is NULL\n", __func__);
		return 1;
	}

	if (atomic_cmpxchg(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED,
			   MICVNET_STATE_TRANSITIONING) != MICVNET_STATE_UNINITIALIZED) {
		printk(KERN_ERR "%s: wrong vnet state %d\n", __func__,
		       atomic_read(&vnet_info->vi_state));
		return 1;
	}

	if ((ret = micvnet_init_lists(vnet_info)))
		goto err_exit;
	if ((ret = micvnet_init_dma(vnet_info)))
		goto err_exit_deinit_lists;
	if ((ret = micvnet_init_rx_skbs(vnet_info))) {
		printk(KERN_ERR "%s: micvnet_init_rx_skbs failed\n", __func__);
		goto err_exit_deinit_dma;
	}

	memset(&vnet_info->vi_netdev->stats, 0, sizeof(vnet_info->vi_netdev->stats));
	atomic_inc(&micvnet.lv_active_clients);
	atomic_set(&vnet_info->vi_state, MICVNET_STATE_LINKUP);

	micvnet_msg_send_link_up_msg(vnet_info);
#ifdef HOST
	micvnet_send_add_dma_buffer_messages(vnet_info);
#else
	/* signal the host (polling scratch14) that the card side is up */
	writel(MICVNET_CARD_UP_MAGIC, vnet_info->vi_scratch14);
	/* Card adds DMA buffers to host after receiving MICVNET_MSG_LINK_UP */
#endif
	return 0;

err_exit_deinit_dma:
	micvnet_deinit_dma(vnet_info);
err_exit_deinit_lists:
	/* RX SKB's are deallocated in micvnet_deinit_lists() */
	micvnet_deinit_lists(vnet_info);
err_exit:
	atomic_set(&vnet_info->vi_state, MICVNET_STATE_UNINITIALIZED);
	return ret;
}
+
/* Work-item wrapper (vi_ws_start): run link bring-up from the vnet
   workqueue thread.  The return value of micvnet_execute_start() has no
   consumer in work context. */
static void
micvnet_start_ws(struct work_struct *work)
{
	struct micvnet_info *vnet_info
		= container_of(work, struct micvnet_info, vi_ws_start);

	micvnet_execute_start(vnet_info);
}
+
+int micvnet_start(mic_ctx_t *mic_ctx)
+{
+#ifndef HOST
+ struct micvnet_info *vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo;
+ micvnet_execute_start(vnet_info);
+#endif
+ return 0;
+}
+
/*
 * Take the link down.  Only acts in LINKUP or BEGIN_UNINIT state.
 * After initiating link-down, the side that started the shutdown
 * (link_down_initiator) waits for the peer's response to move the state
 * to BEGIN_UNINIT — except during a host-forced shutdown (MIC_SHUTDOWN
 * with sdbic1 set), when the card cannot answer.  Finishes with
 * micvnet_stop_deinit().
 */
void
micvnet_execute_stop(struct micvnet_info *vnet_info)
{
	int ret;

	if (!vnet_info)
		return;

	switch(atomic_read(&vnet_info->vi_state)) {
	case MICVNET_STATE_LINKUP:
	case MICVNET_STATE_BEGIN_UNINIT:
		break;
	default:
		return;
	}

#ifdef HOST
	/* keep the card awake for the link-down handshake */
	if ((micpm_get_reference(vnet_to_ctx(vnet_info), true)) != 0)
		goto exit;
#endif
	micvnet_initiate_link_down(vnet_info);
	if (vnet_info->link_down_initiator && !(vnet_info->mic_ctx->state == MIC_SHUTDOWN && vnet_info->mic_ctx->sdbic1)){
		ret = wait_event_interruptible_timeout(
			vnet_info->stop_waitq,
			(atomic_read(&vnet_info->vi_state) == MICVNET_STATE_BEGIN_UNINIT),
			STOP_WAIT_TIMEOUT);
		if (!ret)
			printk(KERN_ERR "%s: timeout waiting for link down message response\n", __func__);
	}

#ifdef HOST
	micpm_put_reference(vnet_to_ctx(vnet_info));
exit:
#endif
	micvnet_stop_deinit(vnet_info);
}
+
+void
+micvnet_stop(mic_ctx_t *mic_ctx)
+{
+ struct micvnet_info *vnet_info = (struct micvnet_info *) mic_ctx->bi_vethinfo;
+
+ vnet_info->link_down_initiator = true;
+ micvnet_execute_stop(vnet_info);
+}
+
/* Work-item wrapper (vi_ws_stop): run shutdown from the wq thread when
   the PEER initiated link-down (hence link_down_initiator = false). */
static void
micvnet_stop_ws(struct work_struct *work)
{
	struct micvnet_info *vnet_info
		= container_of(work, struct micvnet_info, vi_ws_stop);

	vnet_info->link_down_initiator = false;
	micvnet_execute_stop(vnet_info);
}
+
#if !defined(WINDOWS) && defined(HOST)
/* Read-only sysfs attribute "vnet": reports the number of boards with an
   active vnet link (lv_active_clients). */
static ssize_t
show_vnet(struct device *dev, struct device_attribute *attr, char *buf);
DEVICE_ATTR(vnet, S_IRUGO, show_vnet, NULL);

static ssize_t
show_vnet(struct device *dev, struct device_attribute *attr, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "Number of active vnet clients: %d\n",
			atomic_read(&micvnet.lv_active_clients));
}
#endif
+
/*
 * Module-wide vnet initialization: register the rtnl link type and (host
 * only) the sysfs "vnet" attribute.  Sets micvnet.created, which
 * micvnet_probe() requires before doing per-board work.  Returns 0 or
 * the registration error (with partial unwind).
 */
int
micvnet_init(struct device *dev)
{
	int ret = 0;

	micvnet.created = 0;
	atomic_set(&micvnet.lv_active_clients, 0);

	if ((ret = rtnl_link_register(&micvnet_link_ops))) {
		printk(KERN_ERR "%s: rtnl_link_register failed\n", __func__);
		return ret;
	}

#ifdef HOST
	if ((ret = device_create_file(dev, &dev_attr_vnet))) {
		printk(KERN_ERR "%s: device_create_file failed\n", __func__);
		rtnl_link_unregister(&micvnet_link_ops);
		return ret;
	}
#endif
	micvnet.created = 1;
	return 0;
}
+
/*
 * Module-wide vnet teardown.
 * NOTE(review): the host-side "vnet" sysfs attribute created in
 * micvnet_init() is not removed here — presumably handled by the device
 * core when the owning device goes away; confirm.
 */
void
micvnet_exit(void)
{
	rtnl_link_unregister(&micvnet_link_ops);
}
+
+#ifndef HOST
/* Card-side module teardown body: stop the link, remove the per-board
   state, then unregister the link type. */
static void __exit
_micvnet_module_exit(void)
{
	mic_ctx_t *mic_ctx = &mic_ctx_g;

	micvnet_stop(mic_ctx);
	micvnet_remove(mic_ctx);
	micvnet_exit();
}
+
/* Reboot-notifier callback: intentionally a no-op. */
static int
micvnet_reboot(struct notifier_block *notifier, unsigned long unused1, void *unused2)
{
	/* Calling _micvnet_module_exit() here will hang the uOS during shutdown in NFS
	 * root case */
	return NOTIFY_OK;
}

static struct notifier_block micvnet_reboot_notifier = {
	.notifier_call = micvnet_reboot,
	.priority = 0,
};
+
/* module_exit entry: drop the reboot notifier, then run the real
   teardown. */
void __exit
micvnet_module_exit(void)
{
	unregister_reboot_notifier(&micvnet_reboot_notifier);
	_micvnet_module_exit();
}
+
/*
 * module_init entry (card side): register the reboot notifier, then run
 * the init/probe/start sequence against the single global context
 * mic_ctx_g (board id 0).  Failures unwind in reverse order.
 */
int __init
micvnet_module_init(void)
{
	mic_ctx_t *mic_ctx = &mic_ctx_g;
	int ret = 0;

	if ((ret = register_reboot_notifier(&micvnet_reboot_notifier))) {
		printk(KERN_ERR "register_reboot_notifier failed: error %d\n", ret);
		goto err_exit;
	}

	memset(mic_ctx, 0, sizeof(*mic_ctx));
	mic_ctx->bi_id = 0;

	if ((ret = micvnet_init(NULL)))
		goto err_exit_unregister_reboot_notifier;
	if ((ret = micvnet_probe(mic_ctx)))
		goto err_exit_micvnet_exit;
	if ((ret = micvnet_start(mic_ctx)))
		goto err_exit_micvnet_remove;

	return 0;

err_exit_micvnet_remove:
	micvnet_remove(mic_ctx);
err_exit_micvnet_exit:
	micvnet_exit();
err_exit_unregister_reboot_notifier:
	unregister_reboot_notifier(&micvnet_reboot_notifier);
err_exit:
	printk(KERN_ERR "%s failed: error %d\n", __func__, ret);
	return ret;
}
+
+#ifdef STANDALONE_VNET_DMA
+module_init(micvnet_module_init);
+module_exit(micvnet_module_exit);
+#endif
+
+MODULE_LICENSE("GPL");
+#endif
--- /dev/null
+/*
+ * Copyright 2010-2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Disclaimer: The codes contained in these modules may be specific to
+ * the Intel Software Development Platform codenamed Knights Ferry,
+ * and the Intel product codenamed Knights Corner, and are not backward
+ * compatible with other Intel products. Additionally, Intel will NOT
+ * support the codes or instruction set in future products.
+ *
+ * Intel offers no warranty of any kind regarding the code. This code is
+ * licensed on an "AS IS" basis and Intel is not obligated to provide
+ * any support, assistance, installation, training, or other services
+ * of any kind. Intel is also not obligated to provide any updates,
+ * enhancements or extensions. Intel specifically disclaims any warranty
+ * of merchantability, non-infringement, fitness for any particular
+ * purpose, and any other warranty.
+ *
+ * Further, Intel disclaims all liability of any kind, including but
+ * not limited to liability for infringement of any proprietary rights,
+ * relating to the use of the code, even if Intel is notified of the
+ * possibility of such liability. Except as expressly stated in an Intel
+ * license agreement provided with this code and agreed upon with Intel,
+ * no license, express or implied, by estoppel or otherwise, to any
+ * intellectual property rights is granted herein.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/version.h>
+
+#include "mic/micveth.h"
+
/* Expand the VNET_MODES X-macro into an array of lower-case mode-name
   strings, indexed by the mode enum values. */
#define __VNET_MODE(u, l) #l ,
char *mic_vnet_modes[] = { VNET_MODES };
#undef __VNET_MODE
+
/*
 * The kernel_param get/set callback signatures gained a `const`
 * qualifier on the kernel_param pointer around 2.6.36 (the original
 * author was unsure exactly when — "could have been in 35").  GRRR
 * expands to `const` where required and to nothing on older kernels.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
#define GRRR const
#else
#define GRRR /* As nothing */
#endif
+
+static int param_set_vnetmode(const char *val, GRRR struct kernel_param *kp)
+{
+ int i;
+ for (i = 0; i < sizeof(mic_vnet_modes) / sizeof(char *); i++)
+ if (!strcmp(val, mic_vnet_modes[i])) {
+ mic_vnet_mode = i;
+ return 0;
+ }
+ return -EINVAL;
+}
+
/* Getter for the "vnet" module parameter: print the current mode name.
   sprintf into the param buffer follows the standard kernel getter
   pattern; mode names are short, fixed strings from mic_vnet_modes. */
static int param_get_vnetmode(char *buffer, GRRR struct kernel_param *kp)
{
	return sprintf(buffer, "%s", mic_vnet_modes[mic_vnet_mode]);
}
+
/* Type-check hook used by module_param_named(..., vnetmode, ...): the
   backing variable must be an int. */
#define param_check_vnetmode(name, p) __param_check(name, p, int)

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
/* >= 2.6.36 kernels look up custom param handlers via an ops struct;
   older kernels reference param_set_/param_get_##type directly. */
struct kernel_param_ops param_ops_vnetmode = {
	.set = param_set_vnetmode,
	.get = param_get_vnetmode,
};
#endif /* Kernel > 2.6.36 */
+
/* Operating mode of the vnet driver; set via "vnet=<mode>" using the
   custom vnetmode param handlers above.  Defaults to DMA mode. */
int mic_vnet_mode = VNET_MODE_DMA;
module_param_named(vnet, mic_vnet_mode, vnetmode, 0400);
#define __VNET_MODE(u, l) " " #l
MODULE_PARM_DESC(vnet, "Vnet operating mode, one of:" VNET_MODES);
#undef __VNET_MODE

/* Receive-buffer count; clamped to (0, VNET_MAX_SKBS] at init time. */
int vnet_num_buffers = VNET_MAX_SKBS;
module_param(vnet_num_buffers, int, 0400);
MODULE_PARM_DESC(vnet_num_buffers, "Number of buffers used by the VNET driver");

/* Card side only: physical address of the host message-ring pair,
   consumed by micvnet_init_msg_rings(). */
ulong vnet_addr = 0;
module_param(vnet_addr, ulong, 0400);
MODULE_PARM_DESC(vnet_addr, "Vnet driver host ring address");
+
+