Commit | Line | Data |
---|---|---|
800f879a AT |
1 | /* |
2 | * Copyright 2010-2017 Intel Corporation. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License, version 2, | |
6 | * as published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, | |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | * | |
13 | * Disclaimer: The codes contained in these modules may be specific to | |
14 | * the Intel Software Development Platform codenamed Knights Ferry, | |
15 | * and the Intel product codenamed Knights Corner, and are not backward | |
16 | * compatible with other Intel products. Additionally, Intel will NOT | |
17 | * support the codes or instruction set in future products. | |
18 | * | |
19 | * Intel offers no warranty of any kind regarding the code. This code is | |
20 | * licensed on an "AS IS" basis and Intel is not obligated to provide | |
21 | * any support, assistance, installation, training, or other services | |
22 | * of any kind. Intel is also not obligated to provide any updates, | |
23 | * enhancements or extensions. Intel specifically disclaims any warranty | |
24 | * of merchantability, non-infringement, fitness for any particular | |
25 | * purpose, and any other warranty. | |
26 | * | |
27 | * Further, Intel disclaims all liability of any kind, including but | |
28 | * not limited to liability for infringement of any proprietary rights, | |
29 | * relating to the use of the code, even if Intel is notified of the | |
30 | * possibility of such liability. Except as expressly stated in an Intel | |
31 | * license agreement provided with this code and agreed upon with Intel, | |
32 | * no license, express or implied, by estoppel or otherwise, to any | |
33 | * intellectual property rights is granted herein. | |
34 | */ | |
35 | ||
36 | /* | |
37 | * Definition of the public RAS Monitoring Thread interface. | |
38 | * Access to RAS features is expected from SCIF and through | |
39 | * nodes under '/sys/class/micras'. Both interfaces end up | |
40 | * in the same code and thus present the exact same data. | |
41 | * | |
42 | * Some information that are available elsewhere through standard | |
43 | * Linux mechanism are included in this API, though things like | |
44 | * process status (/proc/<pid>/stat), cpu status (/proc/stat), | |
45 | * and memory status (/proc/vmstat) are better from the source. | |
46 | */ | |
47 | ||
48 | #ifndef _MICRAS_API_H_ | |
49 | #define _MICRAS_API_H_ 1 | |
50 | ||
51 | #ifdef __cplusplus | |
52 | extern "C" { /* C++ guard */ | |
53 | #endif | |
54 | ||
55 | /* | |
56 | ** | |
57 | ** Configuration manifests | |
58 | ** | |
59 | */ | |
60 | ||
/*
 * Cap structure member alignment at 4 bytes for the definitions that
 * follow. The matching pop is not visible in this part of the file.
 */
#pragma pack(push, 4)	/* Windows requirement */
62 | ||
63 | ||
64 | /* | |
65 | * RAS module version info: M.NP | |
66 | */ | |
67 | ||
/*
 * Version components are string literals, so RAS_VER is assembled at
 * compile time by literal concatenation: major, ".", minor, patch.
 */
#define RAS_MAJOR	"1"
#define RAS_MINOR	"0"
#define RAS_PATCH	" "
#define RAS_VER		RAS_MAJOR "." RAS_MINOR RAS_PATCH
72 | ||
73 | ||
74 | /* | |
75 | * RAS services in uOS kernel listens on this port for incoming queries. | |
76 | * Consumers may establish multiple connections to this port, though no | |
77 | * guarantee on connection processing order will be given. Transactions | |
78 | * on a connection will be processed and replied to in the order received. | |
79 | */ | |
80 | ||
/* SCIF port the RAS service listens on; SCIF_RAS_PORT_0 is defined elsewhere. */
#define MR_MON_PORT	SCIF_RAS_PORT_0
/* NOTE(review): presumably the max number of SCIF connections -- confirm. */
#define MR_SCIF_MAX	32
83 | ||
84 | ||
85 | /* | |
86 | * Some array max sizes. | |
87 | * These may be replaced by system wide constants | |
88 | * if they become available in the source tree. | |
89 | */ | |
90 | ||
/* Fixed array dimensions used by the response structures in this header. */
#define MR_VERS_LEN	120	/* Version string lengths */
#define MR_GUID_LEN	16	/* Global unique ID length (bytes) */
#define MR_SENO_LEN	12	/* Serial number length (bytes) */
#define MR_PVER_LEN	8	/* API version string length */
#define MR_PTAB_LEN	64	/* PM freq/volt pairs */
#define MR_DIES_LEN	9	/* Die temperatures */
#define MR_BRDS_LEN	4	/* Board temp sensors */
#define MR_GVND_LEN	16	/* GDDR vendor string length */
#define MR_CORE_LEN	62	/* Max number of CPU cores */
100 | ||
101 | ||
102 | /* | |
103 | ** Transaction header for requests and responses is a fixed size | |
104 | ** record followed by an optional variable length data block. | |
105 | ** | |
106 | ** Fields usage: | |
107 | ** cmd [15] data field is error record | |
108 | ** cmd [14] response to opcode | |
109 | ** cmd [13:0] opcode | |
110 | ** len length of payload | |
111 | ** parm command parameter | |
112 | ** stamp host side cookie, performance monitoring | |
113 | ** spent processing time, performance monitoring | |
114 | ** | |
115 | ** Command codes: | |
116 | ** Codes that directly relate to cores may set the 'parm' field to a | |
117 | ** non-zero value to address one core (base 1) instead of them all. | |
118 | ** | |
119 | */ | |
120 | ||
/*
 * Fixed-size transaction header used for both requests and responses.
 * An optional variable-length data block of 'len' bytes follows it.
 */
typedef struct mr_hdr {
	uint16_t	cmd;	/* Command: [15] error record, [14] response, [13:0] opcode */
	uint16_t	len;	/* Size of data payload */
	uint32_t	parm;	/* Parameter field */
	uint64_t	stamp;	/* Time stamp of 'send' (host side cookie, set by host) */
	uint64_t	spent;	/* Time used on response (rdtsc delta) */
} MrHdr;
128 | ||
/* Flag bits and opcode mask for the 16-bit MrHdr.cmd field. */
#define MR_RESP		(1 << 14)	/* Response bit */
#define MR_ERROR	(1 << 15)	/* Error bit */
#define MR_OP_MASK	(MR_RESP - 1)	/* Opcode mask, bits 13:0 */
132 | ||
/*
 * Command opcodes carried in bits 13:0 of the command field.
 * MR_REQ_GPUHOT/MR_SET_GPUHOT share opcodes 42/43 with
 * MR_REQ_PROCHOT/MR_SET_PROCHOT, so each pair selects the same command.
 * NOTE(review): presumably legacy aliases -- confirm before removing.
 */
#define MR_REQ_HWINF	1	/* Get hardware info */
#define MR_REQ_VERS	2	/* Get version strings */
#define MR_REQ_CFREQ	3	/* Get core frequencies */
#define MR_SET_CFREQ	4	/* Set core frequency */
#define MR_REQ_CVOLT	5	/* Get core voltages */
#define MR_SET_CVOLT	6	/* Set core voltage */
#define MR_REQ_PWR	7	/* Get power metrics */
#define MR_REQ_PLIM	8	/* Get power limit */
#define MR_SET_PLIM	9	/* Set power limit */
#define MR_REQ_CLST	10	/* Get core list */
#define MR_ENB_CORE	11	/* Enable core */
#define MR_DIS_CORE	12	/* Disable core */
#define MR_REQ_GDDR	13	/* Get GDDR device info */
#define MR_REQ_GFREQ	14	/* Get GDDR frequencies */
#define MR_SET_GFREQ	15	/* Set GDDR frequency */
#define MR_REQ_GVOLT	16	/* Get GDDR voltages */
#define MR_SET_GVOLT	17	/* Set GDDR voltage */
#define MR_REQ_TEMP	18	/* Get board temperatures */
#define MR_REQ_FAN	19	/* Get fan status */
#define MR_SET_FAN	20	/* Set fan power */
#define MR_REQ_ECC	21	/* Get ECC mode */
#define MR_SET_ECC	22	/* Set ECC mode */
#define MR_REQ_TRC	23	/* Get debug trace level */
#define MR_SET_TRC	24	/* Set debug trace level */
#define MR_REQ_TRBO	25	/* Get turbo mode status */
#define MR_SET_TRBO	26	/* Set turbo mode status */
#define MR_REQ_OCLK	27	/* Get overclocking status */
#define MR_SET_OCLK	28	/* Set overclocking status */
#define MR_REQ_CUTL	29	/* Get core utilization */
#define MR_REQ_MEM	30	/* Get memory utilization */
#define MR_REQ_OS	31	/* Get OS status & process list */
#define MR_REQ_PROC	32	/* Get process details */
#define MR_REQ_THRD	33	/* Get thread details */
#define MR_REQ_PVER	34	/* Get API version */
#define MR_CMD_PKILL	35	/* Kill process */
#define MR_CMD_UKILL	36	/* Kill processes owned by user */
#define MR_GET_SMC	37	/* Get SMC register */
#define MR_SET_SMC	38	/* Write SMC register */
#define MR_REQ_PMCFG	39	/* Get PM config mode */
#define MR_REQ_LED	40	/* Get LED mode */
#define MR_SET_LED	41	/* Set LED mode */
#define MR_REQ_PROCHOT	42	/* Get PROC hot trigger */
#define MR_SET_PROCHOT	43	/* Set PROC hot trigger */
#define MR_REQ_GPUHOT	42	/* Get GPU hot trigger (same opcode as MR_REQ_PROCHOT) */
#define MR_SET_GPUHOT	43	/* Set GPU hot trigger (same opcode as MR_SET_PROCHOT) */
#define MR_REQ_PWRALT	44	/* Get power alert trigger */
#define MR_SET_PWRALT	45	/* Set power alert trigger */
#define MR_REQ_PERST	46	/* Get persistent triggers flag */
#define MR_SET_PERST	47	/* Set persistent triggers flag */
#define MR_REQ_TTL	48	/* Get Throttle state */
#define MR_REQ_MAX	48	/* Max command code */
184 | ||
185 | ||
186 | /* | |
187 | ** | |
188 | ** Transaction error record: | |
189 | ** If an error occurs during the handling of a request, an | |
190 | ** error record is returned, possibly with supplemental info. | |
191 | ** | |
192 | ** Fields usage: | |
193 | ** err code indication error condition | |
194 | ** len size of additional data | |
195 | ** | |
196 | ** For now there is no definition on what supplemental info | |
197 | ** should look like, but the idea is to open for a possibility | |
198 | ** of giving very precise specification on what the error was. | |
199 | ** Consider it a place holder for future use. | |
200 | ** | |
201 | ** Error codes: | |
202 | ** Code 'NOMEM' means that space for response generation was unavailable. | |
203 | ** Code 'NOVAL' is used to indicate that a valid request (i.e. a query | |
204 | ** on something temporarily unavailable, like processor utilization on | |
205 | ** a core in a sleep state) has no valid response. | |
206 | ** | |
207 | */ | |
208 | ||
/*
 * Error record returned when handling of a request fails.
 * 'len' gives the size of any supplemental data that follows.
 */
typedef struct mr_err {
	uint16_t	err;	/* Error code field (MR_ERR_*) */
	uint16_t	len;	/* Length of additional error info */
} MrErr;
213 | ||
/* Error codes reported in the 'err' field of an error record. */
#define MR_ERR_INVOP	1	/* Request error, command/opcode invalid */
#define MR_ERR_INVLEN	2	/* Request error, length not valid for opcode */
#define MR_ERR_INVAUX	3	/* Request error, parm field not valid for opcode */
#define MR_ERR_INVDATA	4	/* Request error, content of data block invalid */
#define MR_ERR_PERM	5	/* Failure, privileged command */
#define MR_ERR_NOMEM	6	/* Failure, out of memory */
#define MR_ERR_SMC	7	/* Failure, SMC communication */
#define MR_ERR_NOVAL	8	/* Failure, no valid value to report */
#define MR_ERR_UNSUP	9	/* Failure, not implemented (temporary) */
#define MR_ERR_RANGE	10	/* Failure, parameter out of range */
#define MR_ERR_PEND	11	/* Pending, internal use only */
225 | ||
226 | ||
227 | /* | |
228 | ** | |
229 | ** Response container structures below. | |
230 | ** | |
231 | ** Strings are returned in Pascal format (why?), i.e. pre-fixed | |
232 | ** with a 1 byte length field and post-fixed with a 0 byte. | |
233 | ** | |
234 | */ | |
235 | ||
236 | ||
237 | /* | |
238 | * MIC Hardware Info | |
239 | * REQ_HWINF Notes: | |
240 | * - no idea how to determine PCI-E slot, it's a host side thing. | |
241 | * - assume revision is same as model ID in the component ID register | |
242 | * - unique ID not available in all flash versions | |
243 | * - Hardware version codes are reported as-is, anticipating | |
244 | * recipient to know what the codes means. | |
245 | */ | |
246 | ||
247 | typedef struct mr_rsp_hwinf { | |
248 | uint8_t guid[MR_GUID_LEN]; /* Unique ID, from SMC */ | |
249 | uint8_t board; /* Board type, SMC HW 17:16 */ | |
250 | uint8_t fab; /* Fab version, SMC HW 10:8 */ | |
251 | uint8_t sku; /* SKU #, SMC HW 2:0 */ | |
252 | uint8_t slot; /* PCI-E slot, get from where ? */ | |
253 | uint8_t rev; /* Revision, component ID 16:19 */ | |
254 | uint8_t step; /* Stepping, component ID 12:15 */ | |
255 | uint8_t substep; /* Sub-stepping, component ID 8:11 */ | |
256 | uint8_t serial[MR_SENO_LEN]; /* Serial number, from SMC */ | |
257 | } MrRspHwInf; | |
258 | ||
259 | ||
260 | ||
261 | /* | |
262 | * MIC API version | |
263 | * REQ_PVER Notes: | |
264 | * - returns RAS_VER string the module was built with. | |
265 | */ | |
266 | ||
267 | typedef struct mr_rsp_pver { | |
268 | char api[MR_PVER_LEN]; /* Ras module version */ | |
269 | } MrRspPver; | |
270 | ||
271 | ||
272 | ||
273 | /* | |
274 | * MIC uOS/Flash version | |
275 | * REQ_VERS Notes: | |
276 | * - unclear at this point what the lengths of these strings are. | |
277 | * The limit of MR_VERS_LEN (120) bytes is a 'best safe guess' and may change. | |
278 | * - KnF: My card has 3 flash strings, for now that's the count. | |
279 | * - KnC: Has fewer defined version strings, currently only fboot0 | |
280 | * string has been defined. | |
281 | */ | |
282 | ||
283 | typedef struct mr_rsp_vers { | |
284 | char fboot0[MR_VERS_LEN]; /* Fboot 0 version */ | |
285 | char fboot1[MR_VERS_LEN]; /* Fboot 1 version */ | |
286 | char flash[3][MR_VERS_LEN]; /* Flash block versions */ | |
287 | char uos[MR_VERS_LEN]; /* uOS kernel version */ | |
288 | char fsc[MR_VERS_LEN]; /* Fan controller version */ | |
289 | } MrRspVers; | |
290 | ||
291 | ||
292 | ||
293 | /* | |
294 | * Core frequency | |
295 | * REQ_CFREQ Notes: | |
296 | * - current is clock read from CURRENTRATIO register. | |
297 | * - default/requested clock is read from COREFREQ register. | |
298 | * In KnF, the CURRENTRATIO is not used and therefore | |
299 | * COREFREQ is reported as current speed and the default | |
300 | * is simply the first value registered (at module load). | |
301 | * - supported speeds are part of freq/voltage pairs maintained | |
302 | * by the cpu_freq driver as part of PM (cpu_freq driver). | |
303 | * - unclear if we should allow manual control (writes). | |
304 | */ | |
305 | ||
306 | typedef struct mr_rsp_freq { | |
307 | uint32_t cur; /* Actual core speed in kHz */ | |
308 | uint32_t def; /* Set core speed in kHz */ | |
309 | uint32_t slen; /* Supported count */ | |
310 | uint32_t supt[MR_PTAB_LEN]; /* Supported speed list in kHz */ | |
311 | } MrRspFreq; | |
312 | ||
313 | /* | |
314 | * Set core frequency | |
315 | * New frequency (in kHz) passed in MrHdr.parm | |
316 | * SET_CFREQ Notes: | |
317 | * - need to turn off PM for this to stick | |
318 | */ | |
319 | ||
320 | ||
321 | ||
322 | /* | |
323 | * Core voltage | |
324 | * REQ_CVOLT Notes: | |
325 | * - KnF: Two core voltages; current voltage set from COREVOLT | |
326 | * register and sense1 read in the BOARD_VOLTAGE_SENSE register. | |
327 | * - KnC: 3 potential sources; SVID, SMC, and SBOX registers. | |
328 | * SBOX regs require SMC telemetry which is uncertain. | |
329 | * SVID does not work in A0, B0 is TBD. | |
330 | * SMC will eventually relay VR data. | |
331 | * Only SVID gives both set and actual values. | |
332 | * Only SMC sets c_val field, zero is good. | |
333 | * - Supported voltages are either determined from what the VRs | |
334 | * can support or if PM is active it is part of the freq/voltage pairs | |
335 | * maintained by the cpu_freq driver as part of PM (cpu_freq driver). | |
336 | */ | |
337 | ||
338 | typedef struct mr_rsp_volt { | |
339 | uint32_t cur; /* Core voltage read in uV */ | |
340 | uint32_t set; /* Core voltage set in uV */ | |
341 | uint8_t c_val; /* Valid bits, volt read */ | |
342 | uint32_t slen; /* Supported count */ | |
343 | uint32_t supt[MR_PTAB_LEN]; /* Supported voltage list in uV */ | |
344 | } MrRspVolt; | |
345 | ||
346 | /* | |
347 | * Set core voltage | |
348 | * New voltage passed in MrHdr.parm | |
349 | * SET_CVOLT Notes: | |
350 | * - need to turn off PM for this to stick | |
351 | * - Unclear if we should allow manual control through this API. | |
352 | */ | |
353 | ||
354 | ||
355 | ||
356 | /* | |
357 | * Card power | |
358 | * REQ_PWR Notes | |
359 | * - Power status only available on KnC via SMC query | |
360 | * - VR status on KnC may come from VRs directly or from SMC query | |
361 | * - VR status on KnF comes from SBOX registers (telemetry) | |
362 | * - If available, status bits from query is provided, zero is good. | |
363 | */ | |
364 | ||
/* Power sensor status: one reading plus its status bits (zero is good). */
typedef struct mr_rsp_pws {
	uint32_t	prr;	/* Current reading, in uW */
	uint8_t		p_val;	/* Valid bits, power */
} MrRspPws;
369 | ||
/* Voltage regulator status: power, current and voltage, each with status bits. */
typedef struct mr_rsp_vrr {
	uint32_t	pwr;	/* Power reading, in uW */
	uint32_t	cur;	/* Current, in uA */
	uint32_t	volt;	/* Voltage, in uV */
	uint8_t		p_val;	/* Valid bits, power */
	uint8_t		c_val;	/* Valid bits, current */
	uint8_t		v_val;	/* Valid bits, voltage */
} MrRspVrr;
378 | ||
379 | typedef struct mr_rsp_power { | |
380 | MrRspPws tot0; /* Total power, win 0 */ | |
381 | MrRspPws tot1; /* Total power, win 1 */ | |
382 | MrRspPws inst; /* Instantaneous power */ | |
383 | MrRspPws imax; /* Max instantaneous power */ | |
384 | MrRspPws pcie; /* PCI-E connector power */ | |
385 | MrRspPws c2x3; /* 2x3 connector power */ | |
386 | MrRspPws c2x4; /* 2x4 connector power */ | |
387 | MrRspVrr vccp; /* Core rail */ | |
388 | MrRspVrr vddg; /* Uncore rail */ | |
389 | MrRspVrr vddq; /* Memory subsystem rail */ | |
390 | } MrRspPower; | |
391 | ||
392 | ||
393 | ||
394 | /* | |
395 | * Power envelope | |
396 | * REQ_PLIM Notes: | |
397 | * - power envelope is a PM property. A physical limit | |
398 | * is given to PM, which then calculate derivative high | |
399 | * and low water mark figures. | |
400 | * - values are retrieved from PM module | |
401 | */ | |
402 | ||
/* Power envelope response (REQ_PLIM): physical limit and the two water marks. */
typedef struct mr_rsp_plim {
	uint32_t	phys;	/* Physical limit, in W */
	uint32_t	hmrk;	/* High water mark, in W */
	uint32_t	lmrk;	/* Low water mark, in W */
} MrRspPlim;
408 | ||
409 | /*TBD | |
410 | * Set power envelope | |
411 | * New value passed in MrHdr.parm | |
412 | * SET_PLIM Notes: | |
413 | * - not sure if setting this should be allowed at all. | |
414 | */ | |
415 | ||
416 | ||
417 | ||
418 | /* | |
419 | * Core information | |
420 | * REQ_CLST Notes: | |
421 | * - for the average user a core count is all required, since | |
422 | * logically the cores are _always_ enumerated 0 .. <n>-1. | |
423 | * Physical enumeration, such as ring stop, are not useful. | |
424 | * - perhaps this request should return the CPU bitfields from | |
425 | * the uOS of offline, online, possible, and present masks. | |
426 | * Would allow watching of PM activity. | |
427 | */ | |
428 | ||
/* Core information response (REQ_CLST): core count and threads per core. */
typedef struct mr_rsp_clst {
	uint16_t	count;	/* Cores present */
	uint16_t	thr;	/* Threads per core */
} MrRspClst;
433 | ||
434 | ||
435 | /* | |
436 | * Set core enable/disable | |
437 | * Core id & set/reset value passed in MrHdr.parm | |
438 | * ENB_CORE/DIS_CORE Notes: | |
439 | * - uOS Linux does not have write access to HW config in SPI flash. | |
440 | * No way to enable/disable cores | |
441 | * - only listed here in case compatibility with FreeBSD is needed. | |
442 | */ | |
443 | ||
444 | ||
445 | ||
446 | /* | |
447 | * Memory device info | |
448 | * REQ_GDDR Notes: | |
449 | * - This is read from scratch9, i.e. provided by bootstrap. | |
450 | */ | |
451 | ||
452 | typedef struct mr_rsp_gddr { | |
453 | char dev[MR_GVND_LEN]; /* Device vendor */ | |
454 | uint16_t rev; /* Device revision */ | |
455 | uint32_t size; /* Device size, in Mbit/device */ | |
456 | uint32_t speed; /* Transactions speed, kT/sec */ | |
457 | } MrRspGddr; | |
458 | ||
459 | ||
460 | ||
461 | /* | |
462 | * GDDR frequencies | |
463 | * REQ_GFREQ Notes: | |
464 | * - current clock can be read from MEMORYFREQ register | |
465 | * - the GDDR nominal frequency is reported | |
466 | * - the supported frequency list contains values that PLLs | |
467 | * are capable of producing. Info is of limited use, since | |
468 | * there is no way to control the GDDR frequency (locked by fuses). | |
469 | */ | |
470 | ||
471 | typedef struct mr_rsp_gfreq { | |
472 | uint32_t cur; /* Current GDDR speed in kHz */ | |
473 | uint32_t def; /* Default GDDR speed in kHz */ | |
474 | uint32_t slen; /* Supported count */ | |
475 | uint32_t supt[MR_PTAB_LEN]; /* Supported speeds list in kHz */ | |
476 | } MrRspGfreq; | |
477 | ||
478 | /* | |
479 | * Set GDDR frequency | |
480 | * New frequency passed in MrHdr.parm | |
481 | * SET_GFREQ Notes: | |
482 | * - uOS cannot alter the PLLs because it requires retraining, which | |
483 | * causes loss of memory content. | |
484 | * - KnF: uOS does not have write access to SPI flash, which is required | |
485 | * to modify the GDDR frequency at next reboot. | |
486 | * - KnC: GDDR frequency is hard locked by fuses, cannot change, ever!!! | |
487 | */ | |
488 | ||
489 | ||
490 | ||
491 | /* | |
492 | * GDDR voltages | |
493 | * REQ_GVOLT Notes: | |
494 | * - KnF: Two GDDR voltages; current voltage set from MEMVOLT | |
495 | * register and sense2 from BOARD_VOLTAGE_SENSE register. | |
496 | * MEMVOLT register always returns zero, only sense2 | |
497 | * actually returns something useful in current Si. | |
498 | * - KnC: 3 potential sources; SVID, SMC, and SBOX registers. | |
499 | * SBOX regs require SMC telemetry which is uncertain. | |
500 | * SVID does not work in A0, B0 is TBD. | |
501 | * SMC will eventually relay VR data | |
502 | * Only SVID gives both set and actual values. | |
503 | * Only SMC sets c_val field, zero is good. | |
504 | * - Supported voltages reported are voltages the VRs can be programmed | |
505 | * to supply. Info is of limited use, since there is no way to control | |
506 | * the GDDR voltage (locked by fuses). | |
507 | */ | |
508 | ||
509 | typedef struct mr_rsp_gvolt { | |
510 | uint32_t cur; /* GDDR voltage read in uV */ | |
511 | uint32_t set; /* GDDR voltage set in uV */ | |
512 | uint8_t c_val; /* Valid bits, volt read */ | |
513 | uint32_t slen; /* Supported count */ | |
514 | uint32_t supt[MR_PTAB_LEN]; /* Supported voltage list in uV */ | |
515 | } MrRspGvolt; | |
516 | ||
517 | /* | |
518 | * Set GDDR voltage | |
519 | * New voltage passed in MrHdr.parm | |
520 | * SET_GVOLT Notes: | |
521 | * - uOS cannot alter the VR settings at all. Even if it could | |
522 | * it would still clash with the need to retrain and the memory loss. | |
523 | * - KnF: uOS does not have write access to SPI flash, which is required | |
524 | * to modify the GDDR voltage at next reboot. | |
525 | * - KnC: GDDR voltage is hard locked by fuses, cannot change, ever!!! | |
526 | */ | |
527 | ||
528 | ||
529 | ||
530 | /* | |
531 | * Board temperatures | |
532 | * REQ_TEMP Notes: | |
533 | * - CPU die temps can be read from THERMAL_STATUS (highest | |
534 | * of several sensors) and CURRENT_DIE_TEMP registers. | |
535 | * The die sensors values do not match the status | |
536 | * value, so the conversion formula or calibration | |
537 | * needs a re-visit. | |
538 | * - If we could get at them, we could provide readings | |
539 | * from the following devices, but are they all useful? | |
540 | * Fan inlet sensor | |
541 | * Fan exhaust sensor | |
542 | * GDDR temp (one chip is measured) sensor | |
543 | * Vccp VR | |
544 | * Vddg VR | |
545 | * Vddq VR | |
546 | * - most devices report current and maximum temperatures in | |
547 | * degrees Celsius as a signed integer, 9 bits for die temp | |
548 | * and 8 bits for voltage regulators, 12 bit for sensors. | |
549 | */ | |
550 | ||
/* Single temperature sensor reading with optional status bits. */
typedef struct mr_rsp_tsns {
	int16_t		cur;	/* Current temperature, in C */
	int8_t		c_val;	/* Valid bits, if available */
} MrRspTsns;
555 | ||
/* On-die temperature sensor: current and maximum readings. */
typedef struct mr_rsp_tdie {
	int16_t		cur;	/* Current temperature, in C */
	int16_t		max;	/* Maximum temperature, in C */
} MrRspTdie;
560 | ||
561 | typedef struct mr_rsp_temp { | |
562 | MrRspTsns die; /* Highest on-die measure */ | |
563 | MrRspTdie dies[MR_DIES_LEN]; /* All on-die measures */ | |
564 | MrRspTsns brd; /* Highest board measure */ | |
565 | MrRspTsns fin; /* Fan inlet */ | |
566 | MrRspTsns fout; /* Fan outlet */ | |
567 | MrRspTsns gddr; /* Gddr device */ | |
568 | MrRspTsns vccp; /* Vccp VR */ | |
569 | MrRspTsns vddg; /* Vddg VR */ | |
570 | MrRspTsns vddq; /* Vddq VR */ | |
571 | } MrRspTemp; | |
572 | ||
573 | ||
574 | ||
575 | /* | |
576 | * Fan speed | |
577 | * REQ_FAN Notes: | |
578 | * - fan status is reported in RPM and its control is | |
579 | * a pulse width modulation ratio to 255, i.e. 0 is min, | |
580 | * 127 is ~50% and 255 is max. | |
581 | * - the card has logic for controlling two fans. | |
582 | * Only one is used and we only report status for one. | |
583 | */ | |
584 | ||
/* Fan status response (REQ_FAN): speed in RPM and PWM ratio on a 0..255 scale. */
typedef struct mr_rsp_fan {
	uint16_t	rpm;		/* Fan speed, rpm */
	uint8_t		pwm;		/* Active PWM ratio, 0..255 */
	uint8_t		override;	/* Override flag */
	uint8_t		r_val;		/* Valid bits, speed */
	uint8_t		p_val;		/* Valid bits, PWM */
} MrRspFan;
592 | ||
593 | /* | |
594 | * Set fan speed | |
595 | * Control is passed in MrHdr.parm (struct fits into 32 bit) | |
596 | * SET_FAN Notes: | |
597 | * - this may collide with OOB methods (such as IPMI) | |
598 | * that has priority, no guarantee this will stick. | |
599 | * - changing fan speed parameters may interfere | |
600 | * with PM in undefined ways. | |
601 | */ | |
602 | ||
/*
 * Fan control request (SET_FAN).
 * Small enough to be passed inside the 32-bit MrHdr.parm field.
 */
typedef struct mr_set_fan {
	uint8_t		override;	/* Override enable flag */
	uint8_t		pwm;		/* Force PWM ratio, 0..255 */
} MrSetFan;
607 | ||
608 | ||
609 | ||
610 | /* | |
611 | * Error correction mode | |
612 | * REQ_ECC Notes: | |
613 | * - retrieve this info from one (any) of the gboxes. | |
614 | */ | |
615 | ||
/* Error correction mode response (REQ_ECC). */
typedef struct mr_rsp_ecc {
	uint32_t	enable;	/* ECC mode: 1 enabled, 0 disabled */
} MrRspEcc;
619 | ||
620 | /* | |
621 | * Set error correction mode | |
622 | * New mode passed in MrHdr.parm | |
623 | * SET_ECC Notes: | |
624 | * - ECC cannot be changed on the fly by uOS, requires retraining | |
625 | * of GDDR which causes loss of memory content. | |
626 | * - uOS Linux does not have write access to HW config in SPI flash. | |
627 | * No way to change ECC enable/disable setting. | |
628 | */ | |
629 | ||
630 | ||
631 | ||
632 | /* | |
633 | * Trace level | |
634 | * REQ_TRC Notes: | |
635 | * - No idea what support this has in uOS Linux. | |
636 | */ | |
637 | ||
/* Debug trace level response (REQ_TRC). */
typedef struct mr_rsp_trc {
	uint32_t	lvl;	/* Debug trace level */
} MrRspTrc;
641 | ||
642 | /* | |
643 | * Set trace level | |
644 | * New level passed in MrHdr.parm | |
645 | * SET_TRC Notes: | |
646 | * - No idea what this does in uOS Linux (nothing yet). | |
647 | */ | |
648 | ||
649 | ||
650 | ||
651 | /* | |
652 | * Turbo setting | |
653 | * REQ_TRBO Notes: | |
654 | * - Retrieve current actual turbo mode and state | |
655 | * - 'set' value: 1 if enabled, 0 otherwise | |
656 | * - 'state' value: 1 if active, 0 otherwise | |
657 | * - 'avail' value: 1 if TRBO supported, 0 otherwise | |
658 | */ | |
659 | ||
/* Turbo setting response (REQ_TRBO); explicitly padded to 32 bits. */
typedef struct mr_rsp_trbo {
	uint8_t		set;	/* Turbo mode: 1 if enabled, 0 otherwise */
	uint8_t		state;	/* Turbo state: 1 if active, 0 otherwise */
	uint8_t		avail;	/* Turbo mode available: 1 if supported, 0 otherwise */
	uint8_t		pad;	/* Pad to 32 bit */
} MrRspTrbo;
666 | ||
667 | /* | |
668 | * Set turbo mode | |
669 | * New mode passed in MrHdr.parm | |
670 | * SET_TRBO Notes: | |
671 | * - Set always allowed, but silently ignored if not available. | |
672 | */ | |
673 | ||
674 | ||
675 | ||
676 | /* | |
677 | * LED override | |
678 | * REQ_LED Notes: | |
679 | * - KnC: Retrieve current LED mode setting, 0=normal, 1=identify | |
680 | * - KnF: not implemented (error MR_ERR_UNSUP) | |
681 | */ | |
682 | ||
/* LED mode response (REQ_LED). */
typedef struct mr_rsp_led {
	uint32_t	led;	/* LED mode setting: 0=normal, 1=identify */
} MrRspLed;
686 | ||
687 | /* | |
688 | * Set LED mode | |
689 | * New mode passed in MrHdr.parm | |
690 | * SET_LED Notes: | |
691 | * - KnC: Mode values | |
692 | * 0 is normal SMC control (fast blink) | |
693 | * 1 is identify mode (2 blinks every 2 seconds) | |
694 | * - KnF: not implemented (error MR_ERR_UNSUP) | |
695 | */ | |
696 | ||
697 | ||
698 | ||
699 | /* | |
700 | * Overclocking | |
701 | * REQ_OCLK Notes: | |
702 | * - Currently no idea how to represent overclocking state | |
703 | * - Overclocking not supported, return MR_ERR_NOVAL | |
704 | */ | |
705 | ||
/* Overclocking response (REQ_OCLK). */
typedef struct mr_rsp_oclk {
	uint32_t	freq;	/* Over clocking setting */
} MrRspOclk;
709 | ||
710 | /* | |
711 | * Set overclocking mode | |
712 | * New mode passed in MrHdr.parm | |
713 | * SET_OCLK Notes: | |
714 | * - Overclocking not supported, return MR_ERR_NOVAL | |
715 | */ | |
716 | ||
717 | ||
718 | ||
719 | /* | |
720 | * Processor utilization (OS status) | |
721 | * REQ_CUTL Notes: | |
722 | * - returned info is a simple sum of 4 logical CPUs | |
723 | * - the counter units returned are Linux kernel jiffies, | |
724 | * typically in range 1 - 10 ms, based on continuous | |
725 | * counters maintained by the kernel. The number of | |
726 | * jiffies per second is reported for scaling purposes. | |
727 | * In order to get a current 'utilization' figure, the | |
728 | * host needs to query the counters at regular intervals | |
729 | * and use this formula to achieve a percentage: | |
730 | * u = ((c2 - c1) / (t2 - t1)) * 100 | |
731 | * or | |
732 | * u = ((c2 - c1) * 100) / (t2 - t1) | |
733 | * where t2 - t1 = elapsed jiffies between samples | |
734 | * c2 - c1 = usage jiffy counts between samples | |
735 | * - the listed counters do not add up to cover the | |
736 | * wall clock time exactly, sampling errors do occur. | |
737 | * - counters for iowait, irq, and softirq are not included. | |
738 | * - jiffy counters are updated by the timer tick interrupt | |
739 | * handler. Its accuracy is known to be limited, see | |
740 | * Documentation/cpu-load.txt for details. | |
741 | * - counters are reported regardless of core sleep states | |
742 | */ | |
743 | ||
/* One set of CPU time counters, all in Linux kernel jiffies. */
typedef struct mr_rsp_ccnt {
	uint64_t	user;	/* Normal user mode jiffies */
	uint64_t	nice;	/* 'Nice' user mode jiffies */
	uint64_t	sys;	/* System mode jiffies */
	uint64_t	idle;	/* Idle time jiffies */
} MrRspCcnt;
750 | ||
751 | typedef struct mr_rsp_cutl { | |
752 | uint32_t tck; /* Actual jiffs/sec (scaled by 256) */ | |
753 | uint16_t core; /* Cores reported on */ | |
754 | uint16_t thr; /* Threads per core */ | |
755 | uint64_t jif; /* Jiffy counter at query time */ | |
756 | MrRspCcnt sum; /* System wide counters */ | |
757 | MrRspCcnt cpu[MR_CORE_LEN]; /* Counters per core */ | |
758 | } MrRspCutl; | |
759 | ||
760 | ||
761 | ||
762 | /* | |
763 | * Memory utilization (OS status) | |
764 | * REQ_MEM Notes: | |
765 | * - memory snapshot is obtained from kernel structs. | |
766 | * No walk of page descriptors is performed. | |
767 | * - Not all memory stats are visible (exported to) modules. | |
768 | * | |
769 | *TBD: | |
770 | * - Need clarification on what memory utilization means. | |
771 | * For now the total, free and buffer memory is reported. | |
772 | */ | |
773 | ||
/* Memory utilization response (REQ_MEM): total, free and buffer memory. */
typedef struct mr_rsp_mem {
	uint32_t	total;	/* Total usable RAM in kB */
	uint32_t	free;	/* Free memory in kB */
	uint32_t	bufs;	/* Buffer storage in kB */
} MrRspMem;
779 | ||
780 | ||
781 | ||
/*
 * Process management (OS status)
 * REQ_OS/REQ_PROC/REQ_THRD Notes:
 *  - split in 3 levels of detail:
 *     1) Get set of applications (exclude kernel processes and threads)
 *     2) Get details on specified application (pid in MrHdr.parm),
 *        which includes a thread pid list (up to 256 threads).
 *     3) Get details on specific thread (thread id in MrHdr.parm)
 *    Opcodes 2 and 3 will, apart from thread list, mostly report the same
 *    set of details. What needs monitoring (see 'man proc', section on
 *    /proc/<pid>/stat and /proc/<pid>/status for what's available)?
 *  - process time counters are continuous, so if any ratio between
 *    the time a process/thread spends and actual wall clock time is
 *    to be calculated, the same logic for dynamic display applies as
 *    for the CUTL counters. I.e. a jiffy stamp is needed in the reply.
 *TBD:
 *  - Introduce some sanity in time measurements.
 *  - Level 3 (thread details) is not implemented (is it needed?).
 *  - Add ppid & credentials in MrRspProc? Needed to make a "top" display.
 */
802 | ||
/*
 * OS level status: uptime, load averages and the list of
 * application PIDs. The PID list has room for 256 entries.
 * NOTE(review): alen presumably tells how many apid[] slots are
 * valid - confirm against the code that fills this record.
 */
typedef struct mr_rsp_os {
	uint64_t uptime;        /* Seconds since OS boot */
	uint64_t loads[3];      /* 1, 5, 15 minute load average */
	uint32_t alen;          /* Application count */
	uint32_t apid[256];     /* Application PIDs */
} MrRspOs;
809 | ||
/*
 * Details on one process: identity, time counters, memory footprint
 * and the list of its thread PIDs (room for 256 entries).
 * NOTE(review): the file ends with '#pragma pack(pop)', so the layout
 * presumably depends on a pack setting established earlier in the
 * file; do not assume natural alignment between 'name' and 'utime'.
 */
typedef struct mr_rsp_proc {
	uint32_t pid;           /* Process ID */
	char     name[16];      /* Program name (less path) */
	uint64_t utime;         /* User time in uS */
	uint64_t stime;         /* System time in uS */
	uint64_t etime;         /* Elapsed time in uS */
	uint32_t rss;           /* Resident set, in kB */
	uint32_t vm;            /* VM size, in kB */
	uint32_t tlen;          /* Thread count */
	uint32_t tpid[256];     /* Process threads */
} MrRspProc;
821 | ||
822 | ||
823 | ||
/*
 * Terminate process
 * Signal passed in MrHdr.parm bits 31:24 (see 'kill -l')
 * Process ID passed in MrHdr.parm bits 23:0 (see /proc/sys/kernel/pid_max)
 * CMD_PKILL Notes:
 *  - This is specifically for MPI style cluster managers
 *    that want to rid the card of a specific process.
 *  - Processes owned by user IDs less than 500 are immune to this.
 */
833 | ||
834 | ||
835 | ||
/*
 * Terminate user
 * Signal passed in MrHdr.parm bits 31:24 (see 'kill -l')
 * User ID passed in MrHdr.parm bits 23:0 (see /etc/login.defs).
 * CMD_UKILL Notes:
 *  - This is specifically for MPI style cluster managers to
 *    rid the card of processes owned by a specific user ID.
 *  - User IDs below 500 will silently be ignored.
 */
845 | ||
846 | ||
847 | ||
/*
 * Read SMC register
 * MR_GET_SMC Notes:
 *  - Both SMC and FSC devices are accessed through I2C busses, which
 *    means that retrieval will be slow (order of milliseconds).
 *  - KnC: allows direct access to the SMC CSRs, which can be read
 *    or written in any random order.
 *    SMC CSR definitions are not within the scope of this API.
 *    Register number passed in MrHdr.parm bits 7:0 (8 bits).
 *    SMC registers are 32 bit, except one (UUID) that is 16 bytes.
 *  - KnF: allows direct access to the fan speed controller (FSC)
 *    status registers on board temp and power sensors.
 *    The FSC executes its command register every 50 mSec, which means
 *    that the register needs a 'SET' and a hold for 50 mSec before any
 *    value can be returned. For telemetry data the SET is done
 *    implicitly, all others have to execute a 'SET' before running
 *    a 'GET' command.
 *
 *    FSC register definitions are not within the scope of this API.
 *    All sensor data returns are 8 bits wide.
 */
869 | ||
/*
 * Value read back from one SMC register.
 * 'rtn' overlays the possible return shapes; 'width' tells how many
 * of its bytes are valid.
 * NOTE(review): the width comment lists 4 or 16 bytes, yet the union
 * also carries a 12 byte 'serial' member - confirm whether 12 is a
 * legal width as well.
 */
typedef struct mr_rsp_smc {
	uint8_t  reg;                   /* Register number */
	uint16_t width;                 /* Valid return bytes (4 or 16) */
	union {
		uint32_t val;           /* Requested register value */
		uint8_t  uuid[16];      /* Unique identifier */
		uint8_t  serial[12];    /* Card serial number */
	} rtn;
} MrRspSmc;
879 | ||
/*
 * Write SMC register
 * Register number passed in MrHdr.parm bits 31:24 (8-bit address decode).
 * Register value passed in MrHdr.parm bits 23:0 (24 bit data).
 * MR_SET_SMC Notes:
 *  - Improper use of this command can cause thermal shutdown of the card.
 *  - Improper use can interfere with power management.
 *  - KnC: For security reasons only the following registers are writable:
 *      20, 22                          IPMI <not documented>
 *      2b, 2c, 2d, 2f, 30, 31, 32, 33  PM control parameters
 *      4b                              Fan Adder
 *      60                              LED control
 *    No SMC registers of interest are more than 16 bits wide.
 *  - KnF: For security reasons only the following registers are writable:
 *      0                               Fan 1 Speed Override
 *      1                               Power Management and Control Config
 *      11                              General Status command
 *    Selector is 8 bits wide and the only valid values are
 *      20, 21, 22, 23                  Power sensors, 1s avg.
 *      30, 31, 32, 33                  Power sensors, 1 sample
 *      a1, a2, a3, a4, a5              Max temps
 */
902 | ||
903 | ||
904 | ||
/*
 * Get PM config mode
 * REQ_PMCFG Notes:
 *  - Return value is reported 'as-is' from the PM module.
 */
910 | ||
/*
 * Power management configuration reply.
 * Carries a single 32 bit mode word.
 */
typedef struct mr_rsp_pmcfg {
	uint32_t mode;          /* Current PM operation mode */
} MrRspPmcfg;
914 | ||
915 | ||
916 | ||
/*
 * Read Power triggers
 * Consists of two trigger points (power,time), which can be calculated
 * from SKU at card power-on or be persistent across reboots.
 * At trigger (PROCHOT), GPU Hot gets asserted
 * At trigger (PWRALT), Power Alert gets asserted
 *
 * MR_REQ_PROCHOT, MR_REQ_PWRALT Notes:
 *  - KnC: Read SMC registers for trigger 0 and 1 respectively.
 *         GPUHOT: registers 0x2c and 0x2d
 *         PWRALT: registers 0x2f and 0x30
 *  - KnF: not implemented (error MR_ERR_UNSUP)
 */
930 | ||
/*
 * One power trigger point: a power limit and the time window it
 * applies to. Two 16 bit fields, 32 bits in total.
 */
typedef struct mr_rsp_ptrig {
	uint16_t power;         /* Power limit, Watt */
	uint16_t time;          /* Time windows, mSec */
} MrRspPtrig;
935 | ||
/*
 * Write Power triggers
 * MR_SET_PROCHOT, MR_SET_PWRALT Notes:
 *    Structure MrRspPtrig passed in MrHdr.parm
 *    Trigger PROCHOT.power must be higher than trigger PWRALT.power.
 *  - KnC: Write SMC registers for trigger 0 and 1 respectively.
 *         GPUHOT: registers 0x2c and 0x2d
 *         PWRALT: registers 0x2f and 0x30
 *  - KnF: not implemented (error MR_ERR_UNSUP)
 * Warning: MT does not check for GPUHOT.power >= PWRALT.power.
 *TBD: Should it?
 *    It is anticipated that changes follow reads, i.e. the check
 *    can be done in application software.
 */
950 | ||
951 | ||
952 | ||
/*
 * Read Persistent Power triggers flag
 * If set, changes to Power Triggers will be permanent
 * MR_REQ_PERST Notes:
 *  - KnC: Reads bit 0 of SMC register 0x32
 *  - KnF: not implemented (error MR_ERR_UNSUP)
 */
960 | ||
/*
 * Persistent power triggers flag, reported as a 32 bit word.
 */
typedef struct mr_rsp_perst {
	uint32_t perst;         /* Persistent power triggers */
} MrRspPerst;
964 | ||
/*
 * Write Persistent Power triggers flag
 * New value passed in MrHdr.parm
 * MR_SET_PERST Notes:
 *  - KnC: Writes bit 0 of SMC register 0x32
 *  - KnF: not implemented (error MR_ERR_UNSUP)
 */
972 | ||
973 | ||
/*
 * Read Throttle states
 * Returns status of current and previous throttle state
 * retrieved from the card side PM module.
 * MR_REQ_TTL Notes:
 *  - KnC: Calls PM for latest information.
 *         Note that the 'active' flags can toggle very often,
 *         which may make it less informative for display.
 *         Time tracked in jiffies, not true mSec resolution.
 *  - KnF: not implemented (error MR_ERR_UNSUP)
 */
985 | ||
/*
 * State of one throttle source: whether it is active right now,
 * how long the current throttle has lasted, and the running totals.
 * NOTE(review): a uint8_t is followed directly by uint32_t fields and
 * the file ends with '#pragma pack(pop)', so the layout presumably
 * depends on a pack setting established earlier in the file.
 */
typedef struct mr_rsp_tstat {
	uint8_t  active;        /* Currently active */
	uint32_t since;         /* Length of current throttle, mSec */
	uint32_t count;         /* Number of throttles */
	uint32_t time;          /* Total time throttled, mSec */
} MrRspTstat;
992 | ||
993 | typedef struct mr_rsp_ttl { | |
994 | MrRspTstat thermal; /* Thermal throttle state */ | |
995 | MrRspTstat power; /* Power throttle state */ | |
996 | MrRspTstat alert; /* Power alert state */ | |
997 | } MrRspTtl; | |
998 | ||
999 | ||
1000 | #pragma pack(pop) /* Restore to entry conditions */ | |
1001 | ||
1002 | #ifdef __cplusplus | |
1003 | } /* C++ guard */ | |
1004 | #endif | |
1005 | ||
1006 | #endif /* Recursion block */ |