Commit | Line | Data |
---|---|---|
3144ee8a AT |
1 | /* -*- mode: c; tab-width: 4; fill-column: 78 -*- */ |
2 | /* vi: set ts=4 tw=78: */ | |
3 | ||
4 | /* | |
5 | thread_util.c, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com> | |
6 | ||
7 | Permission to use, copy, modify, distribute, and sell this software and its | |
8 | documentation for any purpose is hereby granted without fee, provided that | |
9 | the above copyright notice appear in all copies and that both that | |
10 | copyright notice and this permission notice appear in supporting | |
11 | documentation. No representations are made about the suitability of this | |
12 | software for any purpose. It is provided "as is" without express or | |
13 | implied warranty. | |
14 | */ | |
15 | ||
16 | #if HAVE_CONFIG_H | |
17 | # include "config.h" | |
18 | #endif | |
19 | ||
20 | #include <assert.h> | |
21 | #include <errno.h> | |
22 | #include <limits.h> | |
23 | #include <stdlib.h> | |
24 | #include <stdio.h> /* Only used by thread_memory_alignment(). */ | |
25 | #include <string.h> | |
26 | ||
27 | #if HAVE_ALLOCA_H | |
28 | # include <alloca.h> | |
29 | #endif | |
30 | ||
31 | #if HAVE_UNISTD_H | |
32 | # include <unistd.h> | |
33 | #endif | |
34 | ||
35 | #if defined __MACH__ && defined __APPLE__ /* OS X, iOS */ | |
36 | # include <sys/sysctl.h> | |
37 | # include <inttypes.h> | |
38 | #endif | |
39 | ||
40 | #include "thread_util.h" | |
41 | ||
42 | #include "aligned_malloc.h" | |
43 | #include "resources.h" | |
44 | ||
/* Nonzero when x is a positive integer with exactly one bit set. Note that x
   is evaluated more than once. */
#define IS_POWER_OF_2(x) ((x) > 0 && ((x) & ((x) - 1)) == 0)
46 | ||
47 | /* | |
48 | arraysize(a). Also known as countof(x), XtNumber(x), NELEMS(x), LEN(x), | |
49 | NUMOF(x), ARRAY_SIZE(x), etc., since the fine folks behind C never got | |
50 | around to including this incredibly useful macro in the standard library, | |
51 | which is where it belongs. | |
52 | ||
53 | Much of the code here assumes that multiple processors in a system all use | |
54 | the same cache line size...which might be wrong on occasion. | |
55 | */ | |
56 | ||
/* Number of elements in a true array (not a pointer). */
#define arraysize(a) (sizeof(a) / sizeof((a)[0]))
/* Pointer to one past the last element of a true array. */
#define arrayend(a) (&(a)[arraysize(a)])
59 | ||
60 | /* | |
61 | These numbers are from: | |
62 | - Linux: arch/(arch name)/include/asm/cache.h, note | |
63 | L1_CACHE_BYTES/L1_CACHE_SHIFT/SMP_CACHE_BYTES. | |
64 | - FreeBSD: sys/(sys name)/include/param.h, note | |
65 | CACHE_LINE_SHIFT/CACHE_LINE_SIZE. | |
66 | ||
67 | Preprocessor symbols come from: | |
68 | - TARGET_CPU_CPP_BUILTINS() in the GNU C preprocessor | |
69 | <http://code.ohloh.net/?s=%22TARGET_CPU_CPP_BUILTINS%22&fp=304413> | |
70 | - http://predef.sourceforge.net/ | |
71 | */ | |
72 | ||
73 | /* | |
74 | Several architectures need preprocessor symbols. | |
75 | ||
76 | Qualcomm Hexagon: 1 << 5 | |
77 | Imagination Technologies META: 1 << 6 | |
78 | OpenRISC: 16 (Linux has the cache line size as a todo.) | |
79 | Unicore: 1 << 5 | |
80 | */ | |
81 | ||
82 | #if HAVE_PTHREAD | |
83 | ||
84 | # if !HAVE_UNISTD_H | |
85 | # error unistd.h must be present whenever pthread.h is. | |
86 | # endif | |
87 | ||
/*
Compile-time guess for the cache line size, _CACHE_LINE_SIZE. This is only a
fallback: _get_cache_line_size() asks the operating system first and returns
this value when the OS cannot answer.

Order of precedence:
  1. iOS: fixed at 64.
  2. FreeBSD: CACHE_LINE_SIZE from <machine/param.h>, if defined.
  3. A per-architecture table keyed on compiler predefined macros.
  4. NetBSD: COHERENCY_UNIT from <sys/param.h>, if defined.
  5. 256, as a deliberately large last resort.
*/

# if defined __MACH__ && defined __APPLE__ /* OS X, iOS */
#  include <TargetConditionals.h> /* For TARGET_OS_IPHONE. */
/* TargetConditionals.h always defines TARGET_OS_IPHONE (as 0 or 1), so it
   must be tested by value; #ifdef would also match desktop OS X. */
#  if TARGET_OS_IPHONE
#   define _CACHE_LINE_SIZE 64
#  endif
# endif

# if defined __FreeBSD__ && !defined _CACHE_LINE_SIZE
#  include <machine/param.h>
#  ifdef CACHE_LINE_SIZE
#   define _CACHE_LINE_SIZE CACHE_LINE_SIZE
#  endif
# endif

# if !defined _CACHE_LINE_SIZE
#  if defined __alpha || defined __alpha__
/* DEC Alpha */
#   define _CACHE_LINE_SIZE 64 /* EV6 and above. EV4 and EV5 use 32 bytes. */
#  elif defined __arm__
/* ARM architecture */
#   define _CACHE_LINE_SIZE (1 << 6)
#  elif defined __AVR || defined __AVR__
/* Atmel AVR32 */
#   define _CACHE_LINE_SIZE (1 << 5)
#  elif defined __bfin || defined __BFIN__
/* Analog Devices Blackfin */
#   define _CACHE_LINE_SIZE (1 << 5)
#  elif defined _TMS320C6X || defined __TMS320C6X__
/* Texas Instruments TMS320C6x */
#   define _CACHE_LINE_SIZE (1 << 7) /* From L2. L1 data cache line is 1 << 6. */
#  elif defined __cris
/* Axis Communications ETRAX CRIS */
#   define _CACHE_LINE_SIZE 32
#  elif defined __ia64__ || defined _IA64
/* Intel Itanium */
#   define _CACHE_LINE_SIZE (1 << 7)
#  elif defined __M32R__ || defined __m32r__
/* Mitsubishi/Renesas M32R */
#   define _CACHE_LINE_SIZE (1 << 4)
#  elif defined __m68k__ || defined M68000 || defined __MC68K__
/* Motorola 68000 */
#   define _CACHE_LINE_SIZE (1 << 4)
#  elif defined __MICROBLAZE__ || defined __microblaze__
/* Xilinx MicroBlaze */
#   define _CACHE_LINE_SIZE (1 << 5)
#  elif defined __mips__ || defined __mips || defined __MIPS__
/* MIPS */
#   define _CACHE_LINE_SIZE (1 << 6)
#  elif defined __mn10300__ || defined __MN10300__
/* Matsushita/Panasonic MN103 */
#   define _CACHE_LINE_SIZE 32 /* MN103E010 has 16 bytes. */
#  elif defined __hppa || defined __hppa__
/* Hewlett-Packard PA-RISC */
#   define _CACHE_LINE_SIZE 64 /* PA-RISC 2.0 uses 64 bytes, PA-RISC 1.1 uses 32. */
#  elif defined __powerpc || defined _ARCH_PPC
/* Power Architecture (a.k.a. PowerPC) */
#   define _CACHE_LINE_SIZE (1 << 7) /* Linux has a list of PPC models with associated L1_CACHE_SHIFT values. */
#  elif defined __s390__ || defined __370__ || defined __zarch__ || defined __SYSC_ZARCH__
/* IBM System/390 */
#   define _CACHE_LINE_SIZE 256
#  elif defined SUNPLUS || defined __SCORE__ || defined __score__
/* Sunplus S+core */
#   define _CACHE_LINE_SIZE (1 << 4)
#  elif defined __sh__
/* Hitachi SuperH */
#   define _CACHE_LINE_SIZE (1 << 5) /* SH3 and earlier used 1 << 4. */
#  elif defined __sparc__ || defined __sparc
/* SPARC */
#   define _CACHE_LINE_SIZE (1 << 7) /* Linux and FreeBSD disagree as to what this should be. */
#  elif defined __tile__
/* Tilera TILE series */
#   define _CACHE_LINE_SIZE (1 << 6) /* TILEPro uses different sizes for L1 and L2. */
#  elif defined __i386 || defined __x86_64
/* x86(-64) */
#   define _CACHE_LINE_SIZE (1 << 7)
#  elif defined __xtensa__ || defined __XTENSA__
/* Cadence Design Systems/Tensilica Xtensa */
#   define _CACHE_LINE_SIZE (1 << 5) /* 1 << 4 on some models. */
#  endif
# endif /* !defined _CACHE_LINE_SIZE */

# if defined __NetBSD__ && !defined _CACHE_LINE_SIZE
/*
NetBSD defines COHERENCY_UNIT to be 32 on MIPS, and 64 for all other
platforms -- which is wrong. Still, this is what the kernel uses; if this
value didn't work, the system wouldn't run.
*/
#  include <sys/param.h>
#  ifdef COHERENCY_UNIT
#   define _CACHE_LINE_SIZE COHERENCY_UNIT
#  endif
# endif

# ifndef _CACHE_LINE_SIZE
#  define _CACHE_LINE_SIZE 256 /* Fallback cache line size. */
# endif
183 | ||
184 | static unsigned _get_cache_line_size(void) | |
185 | { | |
186 | /* | |
187 | The general idea: | |
188 | - Try to get the actual cache line size from the operating system. | |
189 | - In the interest of keeping things simple, this only checks with | |
190 | glibc and OS X. | |
191 | - A few other methods that could be added: | |
192 | - Query x86 CPUs directly with the CPUID instruction. | |
193 | - Query various ELF systems through the auxillary vector. | |
194 | (Power, Alpha, SuperH) | |
195 | - Query Linux through | |
196 | /sys/devices/system/cpu/cpu?/cache/index?/coherency_line_size | |
197 | (x86 only, AFAIK) | |
198 | - Query Linux through cache_alignment in /proc/cpuinfo | |
199 | - Query Solaris through PICL. | |
200 | - If that fails, return a value appropriate for the current CPU | |
201 | architecture. | |
202 | - Otherwise, return a sufficiently large number. | |
203 | */ | |
204 | ||
205 | /* | |
206 | sysconf(3) is not a syscall, it's a glibc call that, for cache line sizes, | |
207 | uses CPUID on x86 and returns 0 on other platforms. If it were to work on | |
208 | most other platforms, it would have to get cache information from the | |
209 | kernel, since that information is usually made available by the processor | |
210 | only in privileged mode. | |
211 | https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/i386/sysconf.c;hb=HEAD | |
212 | */ | |
213 | ||
214 | /* uClibc, newlib, dietlibc, musl, Bionic do not have this. */ | |
215 | ||
216 | # if HAVE_UNISTD_H && ( \ | |
217 | defined _SC_LEVEL1_DCACHE_LINESIZE || \ | |
218 | defined _SC_LEVEL2_CACHE_LINESIZE || \ | |
219 | defined _SC_LEVEL3_CACHE_LINESIZE || \ | |
220 | defined _SC_LEVEL4_CACHE_LINESIZE) | |
221 | { | |
222 | static const int names[] = | |
223 | { | |
224 | # ifdef _SC_LEVEL1_DCACHE_LINESIZE | |
225 | _SC_LEVEL1_DCACHE_LINESIZE, | |
226 | # endif | |
227 | # ifdef _SC_LEVEL2_CACHE_LINESIZE | |
228 | _SC_LEVEL2_CACHE_LINESIZE, | |
229 | # endif | |
230 | # ifdef _SC_LEVEL3_CACHE_LINESIZE | |
231 | _SC_LEVEL3_CACHE_LINESIZE, | |
232 | # endif | |
233 | # ifdef _SC_LEVEL4_CACHE_LINESIZE | |
234 | _SC_LEVEL4_CACHE_LINESIZE | |
235 | # endif | |
236 | }; | |
237 | ||
238 | const int *name; | |
239 | long result = 0; | |
240 | ||
241 | for(name = names; name != arrayend(names); ++name) | |
242 | { | |
243 | long sysconf_result = sysconf(*name); /* Can return -1 or 0 on | |
244 | failure. */ | |
245 | ||
246 | if(sysconf_result > result) | |
247 | result = sysconf_result; | |
248 | } | |
249 | ||
250 | if(result) | |
251 | return result; | |
252 | ||
253 | /* Currently, this fails for every platform that isn't x86. Perhaps | |
254 | future versions will support other processors? */ | |
255 | } | |
256 | # endif | |
257 | ||
258 | # if defined __MACH__ && defined __APPLE__ | |
259 | { | |
260 | uint32_t result; /* sysctl.h says that hw.cachelinesize is a | |
261 | CTLTYPE_INT. */ | |
262 | size_t size = sizeof(result); | |
263 | static const int name[] = {CTL_HW, HW_CACHELINE}; | |
264 | ||
265 | if(!sysctl((int *)name, 2, &result, &size, NULL, 0)) /* (int *) is for OS X. */ | |
266 | { | |
267 | assert(size == sizeof(result)); | |
268 | return result; | |
269 | }; | |
270 | } | |
271 | # endif | |
272 | ||
273 | /* Guess based on the CPU type. */ | |
274 | return _CACHE_LINE_SIZE; | |
275 | } | |
276 | ||
/* Template value used to initialize a threadpool's mutex by assignment. With
   _GNU_SOURCE in a debug (non-NDEBUG) build this is the error-checking
   variant; otherwise it is the plain default mutex. */
# if defined _GNU_SOURCE && !defined NDEBUG
const pthread_mutex_t mutex_initializer = PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP;
# else
const pthread_mutex_t mutex_initializer = PTHREAD_MUTEX_INITIALIZER;
# endif

/* Template value used to initialize a threadpool's condition variable. */
const pthread_cond_t cond_initializer = PTHREAD_COND_INITIALIZER;

/* 0 until threads_available() has run; afterwards negative when threads are
   unavailable or disabled. */
static int _has_pthread = 0;
/* Alignment reported by thread_memory_alignment(); replaced with the real
   cache line size once threads_available() enables threading. */
static int _cache_line_size = sizeof(void *);
289 | ||
290 | /* This is actually the init function for various things in here. */ | |
291 | int threads_available(Display *dpy) | |
292 | { | |
293 | /* This is maybe not thread-safe, but: this should -- and generally will -- | |
294 | be called before the program launches its second thread. */ | |
295 | ||
296 | if(!_has_pthread) | |
297 | { | |
298 | # if _POSIX_THREADS | |
299 | _has_pthread = _POSIX_THREADS; | |
300 | # else | |
301 | _has_pthread = sysconf(_SC_THREADS); | |
302 | # endif | |
303 | ||
304 | if(_has_pthread >= 0) | |
305 | { | |
306 | if(get_boolean_resource(dpy, "useThreads", "Boolean")) | |
307 | { | |
308 | _cache_line_size = _get_cache_line_size(); | |
309 | assert(_cache_line_size >= sizeof(void *)); | |
310 | assert(IS_POWER_OF_2(_cache_line_size)); | |
311 | } | |
312 | else | |
313 | { | |
314 | _has_pthread = -1; | |
315 | } | |
316 | } | |
317 | } | |
318 | ||
319 | return _has_pthread; | |
320 | } | |
321 | ||
322 | #endif /* HAVE_PTHREAD */ | |
323 | ||
324 | /* | |
325 | hardware_concurrency() - | |
326 | ||
327 | Various platforms offer various statistics that look like they should be | |
328 | useful: sysconf(_SC_NPROCESSORS_ONLN) (i.e. the number of 'online' | |
329 | processors) in particular is available on many Unixes, and is frequently | |
330 | used for functions like hardware_concurrency(). But 'online' is somewhat | |
331 | ambiguous; it can mean: | |
332 | ||
333 | 1. The number of CPU cores that are not (temporarily) asleep. (e.g. Android | |
334 | can sometimes put cores to sleep if they aren't being used, and this is | |
335 | reflected in _SC_NPROCESSORS_ONLN.) | |
336 | ||
337 | 2. The maximum number of CPU cores that can be provided to this application, | |
338 | as currently set by the system administrator. (2) is the one that | |
339 | hardware_concurrency() ultimately needs. | |
340 | */ | |
341 | ||
342 | /* | |
343 | Shamelessly plagiarized from Boost.Thread and Stack Overflow | |
344 | <http://stackoverflow.com/q/150355>. GNU libstdc++ has some of this too, | |
345 | see thread::hardware_concurrency() in thread.cc. | |
346 | http://gcc.gnu.org/viewcvs/gcc/trunk/libstdc%2B%2B-v3/src/c%2B%2B11/thread.cc?view=markup | |
347 | ||
348 | This might not work right on less common systems for various reasons. | |
349 | */ | |
350 | ||
351 | #if HAVE_PTHREAD | |
352 | # if defined __APPLE__ && defined __MACH__ || \ | |
353 | defined __FreeBSD__ || \ | |
354 | defined __OpenBSD__ || \ | |
355 | defined __NetBSD__ || \ | |
356 | defined __DragonFly__ || \ | |
357 | defined __minix | |
358 | ||
359 | /* | |
360 | BSD Unixes use sysctl(3) for this. | |
361 | Some BSDs also support sysconf(3) for this, but in each case this was added | |
362 | after sysctl(3). | |
363 | Linux: sysctl is present, but strongly deprecated. | |
364 | Minix uses the NetBSD userspace, so it has both this and sysconf(3). | |
365 | QNX: sysctl is present for kern.* and net.*, but it doesn't say anything | |
366 | about hw.* | |
367 | */ | |
368 | ||
369 | /* __APPLE__ without __MACH__ is OS 9 or earlier. __APPLE__ with __MACH__ is OS X. */ | |
370 | ||
371 | /* | |
372 | The usual thing to do here is for sysctl(3) to call __sysctl(2). | |
373 | http://cvsweb.netbsd.org/bsdweb.cgi/src/lib/libc/gen/sysctl.c?only_with_tag=HEAD | |
374 | http://svnweb.freebsd.org/base/head/lib/libc/gen/sysctl.c?view=markup | |
375 | */ | |
376 | ||
377 | /* | |
378 | OS X: Xcode Instruments (as of Xcode 4; Apple likes to move things like | |
379 | this around) can disable CPUs as a debugging tool. | |
380 | Instruments -> Preferences... (Command-,) -> General -> Active Processor Cores | |
381 | FreeBSD, OpenBSD: It doesn't look like CPUs can be disabled. | |
382 | NetBSD: CPUs can be disabled manually through cpuctl(8). | |
383 | */ | |
384 | ||
385 | # include <stddef.h> | |
386 | ||
387 | /* FreeBSD: sys/sysctl.h needs sys/types.h, but the one doesn't bring the | |
388 | other in automatically. */ | |
389 | # include <sys/types.h> | |
390 | # include <sys/sysctl.h> | |
391 | ||
392 | static unsigned _hardware_concurrency(void) | |
393 | { | |
394 | int count; | |
395 | size_t size = sizeof(count); | |
396 | ||
397 | # if defined __APPLE__ && defined __MACH__ | |
398 | /* Apple sez: sysctl("hw.logicalcpu") is affected by the "current power | |
399 | management mode", so use hw.logicalcpu_max. */ | |
400 | /* https://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/sysctl.3.html */ | |
401 | if(!sysctlbyname("hw.logicalcpu_max", &count, &size, NULL, 0)) /* Preferred on more recent Darwin. */ | |
402 | { | |
403 | assert(size == sizeof(count)); | |
404 | return count; | |
405 | } | |
406 | # endif | |
407 | ||
408 | # if defined HW_NCPUONLINE | |
409 | /* NetBSD has this. */ | |
410 | { | |
411 | static const int name[] = {CTL_HW, HW_NCPUONLINE}; | |
412 | if(!sysctl(name, 2, &count, &size, NULL, 0)) | |
413 | { | |
414 | assert(size == sizeof(count)); | |
415 | return count; | |
416 | } | |
417 | } | |
418 | # endif | |
419 | ||
420 | { | |
421 | static const int name[] = {CTL_HW, HW_NCPU}; | |
422 | if(!sysctl((int *)name, 2, &count, &size, NULL, 0)) /* (int *) is for OS X. */ | |
423 | { | |
424 | assert(size == sizeof(count)); | |
425 | return count; | |
426 | } | |
427 | } | |
428 | ||
429 | return 1; | |
430 | } | |
431 | ||
432 | # elif HAVE_UNISTD_H && defined _SC_NPROCESSORS_ONLN | |
433 | ||
434 | /* | |
435 | Supported by: | |
436 | Linux 2.0 was the first version to provide SMP support via clone(2). | |
437 | (e)glibc on Linux provides this, which in turn uses get_nprocs(). | |
438 | get_nprocs in turn uses /sys/devices/system/cpu/online, /proc/stat, or /proc/cpuinfo, whichever's available. | |
439 | https://sourceware.org/git/?p=glibc.git;a=blob;f=posix/sysconf.c;hb=HEAD | |
440 | https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/getsysstats.c;hb=HEAD | |
441 | Linux usually isn't configured to auto-enable/disable cores. | |
442 | SunOS (Solaris), sometime between 4.1.3 and 5.5.1. | |
443 | This includes all open source derivatives of 5.10. (Illumos, OpenIndiana) | |
444 | sysconf(_SC_NPROCESSORS_ONLN) calls _sysconfig(2). | |
445 | Not sure if CPU power management (enabled by default, see cpupm and | |
446 | cpu_deep_idle in power.conf(4)) affects this. | |
447 | psradm(1M) can bring up/down CPU cores, which affects | |
448 | sysconf(_SC_NPROCESSORS_ONLN). | |
449 | http://src.illumos.org/source/xref/illumos-gate/usr/src/lib/libc/port/gen/sysconf.c | |
450 | Minix 3.2, at the latest. (This is the first version to support SMP.) | |
451 | AIX 7.1, probably earlier. | |
452 | ||
453 | Also: | |
454 | Mac OS X apparently has this on 10.5+. | |
455 | FreeBSD 5.0, NetBSD 5.0 also have this. They both call sysctl(3). | |
456 | http://svnweb.freebsd.org/base/head/lib/libc/gen/sysconf.c?view=markup | |
457 | http://cvsweb.netbsd.org/bsdweb.cgi/src/lib/libc/gen/sysconf.c?only_with_tag=HEAD | |
458 | ||
459 | QNX has sysconf(3), but it doesn't have _SC_NPROCESSORS_*. | |
460 | */ | |
461 | ||
/* Processor count via sysconf(_SC_NPROCESSORS_ONLN). Returns 1 when the
   query fails or reports a non-positive value. */
static unsigned _hardware_concurrency(void)
{
	long online = sysconf(_SC_NPROCESSORS_ONLN);

	if(online <= 0)
		return 1;

	return online;
}
467 | ||
468 | # else | |
469 | ||
/* Fallback for unknown systems: there is no way to ask for the processor
   count, so report a single processor. */
static unsigned _hardware_concurrency(void)
{
	const unsigned single_processor = 1;
	return single_processor;
}
474 | ||
475 | # endif | |
476 | #endif | |
477 | ||
478 | unsigned hardware_concurrency(Display *dpy) | |
479 | { | |
480 | #if HAVE_PTHREAD | |
481 | if(threads_available(dpy) >= 0) | |
482 | return _hardware_concurrency(); | |
483 | #endif | |
484 | return 1; | |
485 | } | |
486 | ||
487 | /* thread_memory_alignment() - */ | |
488 | ||
489 | unsigned thread_memory_alignment(Display *dpy) | |
490 | { | |
491 | (void)threads_available(dpy); | |
492 | #if HAVE_PTHREAD | |
493 | return _cache_line_size; | |
494 | #else | |
495 | return sizeof(void *); | |
496 | #endif | |
497 | } | |
498 | ||
499 | /* Thread pool - */ | |
500 | ||
501 | static unsigned _threadpool_count_serial(struct threadpool *self) | |
502 | { | |
503 | #if HAVE_PTHREAD | |
504 | assert(_has_pthread); | |
505 | if(_has_pthread >= 0) | |
506 | return self->count ? 1 : 0; | |
507 | #endif | |
508 | return self->count; | |
509 | } | |
510 | ||
511 | static void _serial_destroy(struct threadpool *self) | |
512 | { | |
513 | void *thread = self->serial_threads; | |
514 | unsigned i, count = _threadpool_count_serial(self); | |
515 | ||
516 | for(i = 0; i != count; ++i) | |
517 | { | |
518 | self->thread_destroy(thread); | |
519 | thread = (char *)thread + self->thread_size; | |
520 | } | |
521 | ||
522 | free(self->serial_threads); | |
523 | } | |
524 | ||
525 | #if HAVE_PTHREAD | |
526 | ||
527 | static void _parallel_abort(struct threadpool *self) | |
528 | { | |
529 | assert(self->count > 1); | |
530 | self->count = self->parallel_unfinished + 1 /* The '+ 1' should technically be _threadpool_count_serial(self). */; | |
531 | PTHREAD_VERIFY(pthread_cond_broadcast(&self->cond)); | |
532 | } | |
533 | ||
/* State shared between threadpool_create() and the workers while they are
   being launched one after another. */
struct _parallel_startup_type
{
	/* Pool the workers belong to. */
	struct threadpool *parent;
	/* Constructor each worker runs on its own thread object. */
	int (*thread_create)(void *self, struct threadpool *pool, unsigned id);
	/* First startup error (an errno-style code), or 0 if none so far. */
	int last_errno;
};
540 | ||
541 | static unsigned _threadpool_count_parallel(struct threadpool *self) | |
542 | { | |
543 | assert(_has_pthread); | |
544 | assert(self->count >= 1); | |
545 | return self->count - 1 /* The '- 1' should technically be _threadpool_count_serial(self). */; | |
546 | } | |
547 | ||
548 | static void *_start_routine(void *startup_raw); | |
549 | ||
550 | /* Tricky lock sequence: _add_next_thread unlocks on error. */ | |
551 | static void _add_next_thread(struct _parallel_startup_type *self) | |
552 | { | |
553 | assert(!self->last_errno); | |
554 | ||
555 | if(self->parent->parallel_unfinished == _threadpool_count_parallel(self->parent)) | |
556 | { | |
557 | PTHREAD_VERIFY(pthread_cond_broadcast(&self->parent->cond)); | |
558 | } | |
559 | else | |
560 | { | |
561 | pthread_t *thread = self->parent->parallel_threads + self->parent->parallel_unfinished; | |
562 | self->last_errno = pthread_create(thread, NULL, _start_routine, self); | |
563 | if(self->last_errno) | |
564 | _parallel_abort(self->parent); | |
565 | } | |
566 | } | |
567 | ||
568 | static void *_thread_free_and_unlock(struct threadpool *self, void *thread) | |
569 | { | |
570 | PTHREAD_VERIFY(pthread_mutex_unlock(&self->mutex)); | |
571 | # if !HAVE_ALLOCA | |
572 | thread_free(thread); | |
573 | # endif | |
574 | return NULL; | |
575 | } | |
576 | ||
577 | static void *_thread_destroy_and_unlock(struct threadpool *self, void *thread) | |
578 | { | |
579 | self->thread_destroy(thread); | |
580 | return _thread_free_and_unlock(self, thread); | |
581 | } | |
582 | ||
583 | /* At one point, one of the threads refused to destroy itself at the end. Why?! And why won't it happen again? */ | |
584 | ||
585 | static void *_start_routine(void *startup_raw) | |
586 | { | |
587 | struct _parallel_startup_type *startup = (struct _parallel_startup_type *)startup_raw; | |
588 | ||
589 | struct threadpool *parent = startup->parent; | |
590 | ||
591 | void *thread; | |
592 | ||
593 | PTHREAD_VERIFY(pthread_mutex_lock(&parent->mutex)); | |
594 | ++parent->parallel_unfinished; | |
595 | ||
596 | # if HAVE_ALLOCA | |
597 | /* Ideally, the thread object goes on the thread's stack. This guarantees no false sharing with other threads, and in a NUMA | |
598 | configuration, ensures that the thread object is using memory from the right node. */ | |
599 | thread = alloca(parent->thread_size); | |
600 | # else | |
601 | startup->last_errno = thread_malloc(&thread, NULL, parent->thread_size); | |
602 | if(startup->last_errno) | |
603 | { | |
604 | _parallel_abort(parent); | |
605 | PTHREAD_VERIFY(pthread_mutex_unlock(&parent->mutex)); | |
606 | return NULL; | |
607 | } | |
608 | # endif | |
609 | ||
610 | /* Setting thread affinity for threads running in lock-step can cause delays | |
611 | and jumpiness. Ideally, there would be some way to recommend (but not | |
612 | require) that a thread run on a certain core/set of cores. */ | |
613 | ||
614 | /* Neither Linux nor libnuma seem to support the concept of a preferred/ideal | |
615 | CPU for a thread/process. */ | |
616 | ||
617 | /* Untested. */ | |
618 | /* { | |
619 | cpu_set_t cpu_set; | |
620 | CPU_ZERO(&cpu_set); | |
621 | CPU_SET(&cpu_set, &parent._threads_unfinished); | |
622 | pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_set); | |
623 | } */ | |
624 | ||
625 | startup->last_errno = startup->thread_create(thread, parent, parent->parallel_unfinished); | |
626 | if(startup->last_errno) | |
627 | { | |
628 | _parallel_abort(parent); | |
629 | return _thread_free_and_unlock(parent, thread); /* Tail calls make everything better. */ | |
630 | } | |
631 | ||
632 | assert(!startup->last_errno); | |
633 | _add_next_thread(startup); /* Calls _parallel_abort() on failure. */ | |
634 | if(startup->last_errno) | |
635 | return _thread_destroy_and_unlock(parent, thread); | |
636 | ||
637 | for(;;) | |
638 | { | |
639 | for(;;) | |
640 | { | |
641 | /* | |
642 | This must come before the '.threads' check, otherwise if | |
643 | threadpool_destroy is called immediately after a run starts, then | |
644 | it's possible that not all threads would be launched for the final | |
645 | run. This can cause deadlock in conjunction with things like | |
646 | barriers. | |
647 | */ | |
648 | if(parent->parallel_pending) | |
649 | break; /* Start a run. */ | |
650 | ||
651 | if(!parent->parallel_threads) | |
652 | return _thread_destroy_and_unlock(parent, thread); /* Threads are shutting down. */ | |
653 | ||
654 | PTHREAD_VERIFY(pthread_cond_wait(&parent->cond, &parent->mutex)); | |
655 | } | |
656 | ||
657 | --parent->parallel_pending; | |
658 | if(!parent->parallel_pending) | |
659 | PTHREAD_VERIFY(pthread_cond_broadcast(&parent->cond)); | |
660 | /* All threads have started processing, other threads can finish. */ | |
661 | ||
662 | PTHREAD_VERIFY(pthread_mutex_unlock(&parent->mutex)); | |
663 | ||
664 | parent->thread_run(thread); | |
665 | ||
666 | PTHREAD_VERIFY(pthread_mutex_lock(&parent->mutex)); | |
667 | # if 0 | |
668 | if(!parent->parallel_threads) /* I don't think this is necessary anymore. */ | |
669 | break; | |
670 | # endif | |
671 | /* Don't loop around until all other threads have begun processing. */ | |
672 | ||
673 | /* I suspect it doesn't matter whether this comes before or after the threads_unfinished check. */ | |
674 | while(parent->parallel_pending) | |
675 | PTHREAD_VERIFY(pthread_cond_wait(&parent->cond, &parent->mutex)); | |
676 | ||
677 | --parent->parallel_unfinished; | |
678 | if(!parent->parallel_unfinished) | |
679 | PTHREAD_VERIFY(pthread_cond_broadcast(&parent->cond)); /* All threads done for now. */ | |
680 | } | |
681 | ||
682 | /* return _thread_destroy_and_unlock(parent, thread); */ | |
683 | } | |
684 | ||
685 | static void _unlock_and_destroy(struct threadpool *self) | |
686 | { | |
687 | pthread_t *threads; | |
688 | ||
689 | threads = self->parallel_threads; | |
690 | self->parallel_threads = NULL; | |
691 | ||
692 | if(threads) | |
693 | PTHREAD_VERIFY(pthread_cond_broadcast(&self->cond)); | |
694 | ||
695 | PTHREAD_VERIFY(pthread_mutex_unlock(&self->mutex)); | |
696 | ||
697 | if(threads) | |
698 | { | |
699 | unsigned i, count = _threadpool_count_parallel(self); | |
700 | for(i = 0; i != count; ++i) | |
701 | PTHREAD_VERIFY(pthread_join(threads[i], NULL)); | |
702 | ||
703 | free(threads); | |
704 | PTHREAD_VERIFY(pthread_cond_destroy(&self->cond)); | |
705 | PTHREAD_VERIFY(pthread_mutex_destroy(&self->mutex)); | |
706 | } | |
707 | ||
708 | _serial_destroy(self); | |
709 | } | |
710 | ||
711 | #endif /* HAVE_PTHREAD */ | |
712 | ||
713 | int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count) | |
714 | { | |
715 | (void)threads_available(dpy); | |
716 | ||
717 | self->count = count; | |
718 | ||
719 | /* If threads are not present, run each "thread" in sequence on the calling | |
720 | thread. Otherwise, only run the first thread on the main thread. */ | |
721 | ||
722 | assert(cls); | |
723 | ||
724 | self->thread_size = cls->size; | |
725 | self->thread_destroy = cls->destroy; | |
726 | ||
727 | { | |
728 | void *thread; | |
729 | unsigned i, count_serial = _threadpool_count_serial(self); | |
730 | ||
731 | if(count_serial) | |
732 | { | |
733 | thread = malloc(cls->size * count_serial); | |
734 | if(!thread) | |
735 | return ENOMEM; | |
736 | } | |
737 | else | |
738 | { | |
739 | /* Might as well skip the malloc. */ | |
740 | thread = NULL; | |
741 | } | |
742 | ||
743 | self->serial_threads = thread; | |
744 | ||
745 | for(i = 0; i != count_serial; ++i) | |
746 | { | |
747 | int error = cls->create(thread, self, i); | |
748 | if(error) | |
749 | { | |
750 | self->count = i; | |
751 | _serial_destroy(self); | |
752 | return error; | |
753 | } | |
754 | ||
755 | thread = (char *)thread + self->thread_size; | |
756 | } | |
757 | } | |
758 | ||
759 | #if HAVE_PTHREAD | |
760 | assert(_has_pthread); /* _has_pthread should be either -1 or >0. */ | |
761 | if(_has_pthread >= 0) | |
762 | { | |
763 | unsigned count_parallel = _threadpool_count_parallel(self); | |
764 | self->mutex = mutex_initializer; | |
765 | self->cond = cond_initializer; | |
766 | self->parallel_pending = 0; | |
767 | self->parallel_unfinished = 0; | |
768 | if(!count_parallel) | |
769 | { | |
770 | self->parallel_threads = NULL; | |
771 | return 0; | |
772 | } | |
773 | ||
774 | self->parallel_threads = malloc(sizeof(pthread_t) * count_parallel); | |
775 | if(!self->parallel_threads) | |
776 | return ENOMEM; | |
777 | ||
778 | { | |
779 | struct _parallel_startup_type startup; | |
780 | startup.parent = self; | |
781 | startup.thread_create = cls->create; | |
782 | startup.last_errno = 0; | |
783 | ||
784 | PTHREAD_VERIFY(pthread_mutex_lock(&self->mutex)); | |
785 | _add_next_thread(&startup); | |
786 | ||
787 | if(!startup.last_errno) | |
788 | { | |
789 | while(self->parallel_unfinished != count_parallel && self->parallel_threads) | |
790 | PTHREAD_VERIFY(pthread_cond_wait(&self->cond, &self->mutex)); | |
791 | } | |
792 | ||
793 | /* This must come after the if(!startup.last_errno). */ | |
794 | if(startup.last_errno) | |
795 | { | |
796 | _unlock_and_destroy(self); | |
797 | } | |
798 | else | |
799 | { | |
800 | self->parallel_unfinished = 0; | |
801 | PTHREAD_VERIFY(pthread_mutex_unlock(&self->mutex)); | |
802 | } | |
803 | ||
804 | return startup.last_errno; | |
805 | } | |
806 | } | |
807 | #endif | |
808 | ||
809 | return 0; | |
810 | } | |
811 | ||
/* Destroys a pool made by threadpool_create(). With threads in use, the
   workers are shut down and joined before the serial thread objects are
   destroyed; otherwise only the serial objects need destroying. */
void threadpool_destroy(struct threadpool *self)
{
#if HAVE_PTHREAD
	if(_has_pthread >= 0)
	{
		/* _unlock_and_destroy() expects the mutex held, and releases it. */
		PTHREAD_VERIFY(pthread_mutex_lock(&self->mutex));
		_unlock_and_destroy(self);
	}
	else
#endif
	{
		_serial_destroy(self);
	}
}
825 | ||
826 | void threadpool_run(struct threadpool *self, void (*func)(void *)) | |
827 | { | |
828 | #if HAVE_PTHREAD | |
829 | if(_has_pthread >= 0) | |
830 | { | |
831 | unsigned count = _threadpool_count_parallel(self); | |
832 | PTHREAD_VERIFY(pthread_mutex_lock(&self->mutex)); | |
833 | ||
834 | /* Do not call threadpool_run() twice without a threadpool_wait() in the middle. */ | |
835 | assert(!self->parallel_pending); | |
836 | assert(!self->parallel_unfinished); | |
837 | ||
838 | self->parallel_pending = count; | |
839 | self->parallel_unfinished = count; | |
840 | self->thread_run = func; | |
841 | PTHREAD_VERIFY(pthread_cond_broadcast(&self->cond)); | |
842 | PTHREAD_VERIFY(pthread_mutex_unlock(&self->mutex)); | |
843 | } | |
844 | #endif | |
845 | ||
846 | /* It's perfectly valid to move this to the beginning of threadpool_wait(). */ | |
847 | { | |
848 | void *thread = self->serial_threads; | |
849 | unsigned i, count = _threadpool_count_serial(self); | |
850 | for(i = 0; i != count; ++i) | |
851 | { | |
852 | func(thread); | |
853 | thread = (char *)thread + self->thread_size; | |
854 | } | |
855 | } | |
856 | } | |
857 | ||
/* Blocks until every worker thread has finished the current run. Does
   nothing when threads are not in use, since threadpool_run() has then
   already done all the work on the calling thread. */
void threadpool_wait(struct threadpool *self)
{
#if HAVE_PTHREAD
	if(_has_pthread < 0)
		return;

	PTHREAD_VERIFY(pthread_mutex_lock(&self->mutex));

	while(self->parallel_unfinished != 0)
		PTHREAD_VERIFY(pthread_cond_wait(&self->cond, &self->mutex));

	PTHREAD_VERIFY(pthread_mutex_unlock(&self->mutex));
#endif
}
870 | ||
871 | /* io_thread - */ | |
872 | ||
873 | #if HAVE_PTHREAD | |
/* Without threads at compile time, there are only stubs in thread_util.h. */
875 | ||
/*
   VERSION_CHECK(have_major, have_minor, want_major, want_minor): nonzero when
   version have_major.have_minor is at least want_major.want_minor. Usable in
   #if expressions.
*/
# define VERSION_CHECK(have_major, have_minor, want_major, want_minor) \
	((have_major) > (want_major) || \
	 ((have_major) == (want_major) && (have_minor) >= (want_minor)))
879 | ||
880 | # if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 7) || \ | |
881 | defined(__clang__) && \ | |
882 | (!defined(__apple_build_version__) && VERSION_CHECK(__clang_major__, __clang_minor__, 3, 1) || \ | |
883 | defined(__apple_build_version__) && VERSION_CHECK(__clang_major__, __clang_minor__, 3, 1)) || \ | |
884 | defined(__ICC) && __ICC >= 1400 | |
885 | ||
886 | /* | |
887 | Clang 3.0 has a partial implementation of GNU atomics; 3.1 rounds it out. | |
888 | http://llvm.org/viewvc/llvm-project/cfe/tags/RELEASE_30/final/include/clang/Basic/Builtins.def?view=markup | |
889 | http://llvm.org/viewvc/llvm-project/cfe/tags/RELEASE_31/final/include/clang/Basic/Builtins.def?view=markup | |
890 | ||
891 | Apple changes the Clang version to track Xcode versions; use | |
892 | __apple_build_version__ to distinguish between the two. | |
893 | ||
894 | Xcode 4.3 uses Apple LLVM 3.1, which corresponds to Clang 3.1. | |
895 | https://en.wikipedia.org/wiki/Xcode | |
896 | ||
897 | Earlier versions of Intel C++ may also support these intrinsics. | |
898 | */ | |
899 | ||
/*
   Sequentially consistent load and exchange, built on the GNU __atomic
   builtins. ptr points at the status word; _status_exchange stores value and
   evaluates to the value that was there before.
*/
#define _status_load(ptr) \
	(__atomic_load_n((ptr), __ATOMIC_SEQ_CST))
#define _status_exchange(ptr, value) \
	(__atomic_exchange_n((ptr), (value), __ATOMIC_SEQ_CST))
902 | ||
903 | /* C11 atomics are around the corner, but they're not here yet for many | |
904 | systems. (Including mine.) */ | |
905 | /* | |
906 | #elif __STDC_VERSION__ >= 201112l && !defined __STDC_NO_ATOMICS__ | |
907 | ||
908 | #include <stdatomic.h> | |
909 | ||
910 | #define _status_load(status) (atomic_load((status))) | |
911 | #define _status_exchange(obj, desired) (atomic_exchange((obj), (desired))) | |
912 | */ | |
913 | ||
/* Solaris provides atomic ops on at least Solaris 10. See atomic_swap(3C) and
   membar_ops(3C). This would probably also need a snippet in configure.in.
   http://graegert.com/programming/using-atomic-operations-in-c-on-solaris-10
*/
918 | ||
919 | # else | |
920 | ||
921 | /* No atomic variables, so here's some ugly mutex-based code instead. */ | |
922 | ||
/* One process-wide mutex guards every status word in this fallback path.
   Nothing ever destroys this mutex. */
pthread_mutex_t _global_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Acquire / release _global_mutex, verifying the pthread return code. */
#define _lock() \
	PTHREAD_VERIFY(pthread_mutex_lock(&_global_mutex))
#define _unlock() \
	PTHREAD_VERIFY(pthread_mutex_unlock(&_global_mutex))
928 | ||
929 | static enum _io_thread_status _status_load(enum _io_thread_status *status) | |
930 | { | |
931 | enum _io_thread_status result; | |
932 | _lock(); | |
933 | result = *status; | |
934 | _unlock(); | |
935 | return result; | |
936 | } | |
937 | ||
938 | static enum _io_thread_status _status_exchange(enum _io_thread_status *obj, enum _io_thread_status desired) | |
939 | { | |
940 | enum _io_thread_status result; | |
941 | _lock(); | |
942 | result = *obj; | |
943 | *obj = desired; | |
944 | _unlock(); | |
945 | return result; | |
946 | } | |
947 | ||
948 | # endif | |
949 | ||
950 | void *io_thread_create(struct io_thread *self, void *parent, void *(*start_routine)(void *), Display *dpy, unsigned stacksize) | |
951 | { | |
952 | if(threads_available(dpy) >= 0) | |
953 | { | |
954 | int error; | |
955 | pthread_attr_t attr; | |
956 | pthread_attr_t *attr_ptr = NULL; | |
957 | ||
958 | if(stacksize) | |
959 | { | |
960 | attr_ptr = &attr; | |
961 | if(pthread_attr_init(&attr)) | |
962 | return NULL; | |
963 | # if (defined _POSIX_SOURCE || defined _POSIX_C_SOURCE || defined _XOPEN_SOURCE) && !defined __GNU__ | |
964 | /* PTHREAD_STACK_MIN needs the above test. */ | |
965 | assert(stacksize >= PTHREAD_STACK_MIN); | |
966 | # endif | |
967 | PTHREAD_VERIFY(pthread_attr_setstacksize(&attr, stacksize)); | |
968 | } | |
969 | ||
970 | /* This doesn't need to be an atomic store, since pthread_create(3) | |
971 | "synchronizes memory with respect to other threads". | |
972 | http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_11 */ | |
973 | self->status = _io_thread_working; | |
974 | ||
975 | error = pthread_create(&self->thread, attr_ptr, start_routine, parent); | |
976 | assert(!error || error == EAGAIN); | |
977 | if(error) | |
978 | parent = NULL; | |
979 | ||
980 | if(attr_ptr) | |
981 | PTHREAD_VERIFY(pthread_attr_destroy(attr_ptr)); | |
982 | ||
983 | return parent; | |
984 | } | |
985 | ||
986 | return NULL; | |
987 | } | |
988 | ||
989 | int io_thread_return(struct io_thread *self) | |
990 | { | |
991 | if(_has_pthread >= 0) | |
992 | { | |
993 | enum _io_thread_status old_status = _status_exchange(&self->status, _io_thread_done); | |
994 | assert(old_status == _io_thread_working || | |
995 | old_status == _io_thread_cancelled); | |
996 | return old_status != _io_thread_working; | |
997 | } | |
998 | ||
999 | return 0; | |
1000 | } | |
1001 | ||
1002 | int io_thread_is_done(struct io_thread *self) | |
1003 | { | |
1004 | if(_has_pthread >= 0) | |
1005 | { | |
1006 | int result = _status_load(&self->status); | |
1007 | assert(result != _io_thread_cancelled); | |
1008 | return result; | |
1009 | } | |
1010 | return 1; | |
1011 | } | |
1012 | ||
1013 | int io_thread_cancel(struct io_thread *self) | |
1014 | { | |
1015 | if(_has_pthread >= 0) | |
1016 | { | |
1017 | enum _io_thread_status old_status = | |
1018 | _status_exchange(&self->status, _io_thread_cancelled); | |
1019 | assert(old_status == _io_thread_working || | |
1020 | old_status == _io_thread_done); | |
1021 | ||
1022 | PTHREAD_VERIFY(pthread_detach(self->thread)); | |
1023 | return old_status != _io_thread_working; | |
1024 | } | |
1025 | ||
1026 | return 0; | |
1027 | } | |
1028 | ||
1029 | void io_thread_finish(struct io_thread *self) | |
1030 | { | |
1031 | if(_has_pthread >= 0) | |
1032 | { | |
1033 | # ifndef NDEBUG | |
1034 | enum _io_thread_status status = _status_load(&self->status); | |
1035 | assert(status == _io_thread_working || | |
1036 | status == _io_thread_done); | |
1037 | # endif | |
1038 | PTHREAD_VERIFY(pthread_join(self->thread, NULL)); | |
1039 | assert(_status_load(&self->status) == _io_thread_done); | |
1040 | } | |
1041 | } | |
1042 | ||
1043 | #endif /* HAVE_PTHREAD */ |