Commit | Line | Data |
---|---|---|
3144ee8a AT |
1 | /* -*- mode: c; tab-width: 4; fill-column: 78 -*- */ |
2 | /* vi: set ts=4 tw=78: */ | |
3 | ||
4 | /* | |
5 | thread_util.h, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com> | |
6 | ||
7 | Permission to use, copy, modify, distribute, and sell this software and its | |
8 | documentation for any purpose is hereby granted without fee, provided that | |
9 | the above copyright notice appear in all copies and that both that | |
10 | copyright notice and this permission notice appear in supporting | |
11 | documentation. No representations are made about the suitability of this | |
12 | software for any purpose. It is provided "as is" without express or | |
13 | implied warranty. | |
14 | */ | |
15 | ||
16 | #ifndef THREAD_UTIL_H | |
17 | #define THREAD_UTIL_H | |
18 | ||
19 | /* thread_util.h because C11 took threads.h. */ | |
20 | ||
21 | /* And POSIX threads because there aren't too many systems that support C11 | |
22 | threads that don't already support POSIX threads. | |
23 | ...Not that it would be too hard to convert from the one to the other. | |
24 | Or to have both. | |
25 | */ | |
26 | ||
27 | /* Beware! | |
28 | Multithreading is a great way to add insidious and catastrophic bugs to | |
29 | a program. Make sure you understand the risks. | |
30 | ||
31 | You may wish to become familiar with race conditions, deadlocks, mutexes, | |
32 | condition variables, and, in lock-free code, memory ordering, cache | |
33 | hierarchies, etc., before working with threads. | |
34 | ||
35 | On the other hand, if a screenhack locks up or crashes, it's not the | |
36 | end of the world: XScreenSaver won't unlock the screen if that happens. | |
37 | */ | |
38 | ||
39 | /* | |
40 | The basic strategy for applying threads to a CPU-hungry screenhack: |
41 | ||
42 | 1. Find the CPU-hungry part of the hack. | |
43 | ||
44 | 2. Change that part so the workload can be divided into N equal-sized | |
45 | loads, where N is the number of CPU cores in the machine. | |
46 | (For example: with two cores, one core could render even scan lines, | |
47 | and the other odd scan lines.) | |
48 | ||
49 | 2a. Keeping in mind that two threads should not write to the same memory | |
50 | at the same time. Specifically, they should not be writing to the | |
51 | same cache line at the same time -- so align memory allocation and | |
52 | memory accesses to the system cache line size as necessary. | |
53 | ||
54 | 3. On screenhack_init, create a threadpool object. This creates N worker | |
55 | threads, and each thread creates and owns a user-defined struct. | |
56 | After creation, the threads are idle. | |
57 | ||
58 | 4. On screenhack_frame, call threadpool_run(). Each thread simultaneously | |
59 | wakes up, calls a function that does one of the equal-sized loads, | |
60 | then goes back to sleep. The main thread then calls threadpool_wait(), | |
61 | which returns once all the worker threads have finished. | |
62 | ||
63 | Using this to implement SMP won't necessarily increase performance by | |
64 | a factor of N (again, N is CPU cores.). Both X11 and Cocoa on OS X can | |
65 | impose a not-insignificant amount of overhead even when simply blitting | |
66 | full-screen XImages @ 30 FPS. | |
67 | ||
68 | On systems with simultaneous multithreading (a.k.a. Hyper-threading), | |
69 | performance gains may be slim to non-existent. |
70 | */ | |
71 | ||
72 | #include "aligned_malloc.h" | |
73 | ||
74 | #if HAVE_CONFIG_H | |
75 | /* For HAVE_PTHREAD. */ | |
76 | # include "config.h" | |
77 | #endif | |
78 | ||
79 | #include <stddef.h> | |
80 | ||
81 | #if HAVE_UNISTD_H | |
82 | /* For _POSIX_THREADS. */ | |
83 | # include <unistd.h> | |
84 | #endif | |
85 | ||
86 | #if defined HAVE_JWXYZ | |
87 | # include "jwxyz.h" | |
88 | #else | |
89 | # include <X11/Xlib.h> | |
90 | #endif | |
91 | ||
92 | #if HAVE_PTHREAD | |
93 | int threads_available(Display *dpy); | |
94 | #else | |
95 | # define threads_available(dpy) (-1) | |
96 | #endif | |
97 | /* > 0: Threads are available. This is normally _POSIX_VERSION. | |
98 | -1: Threads are not available. | |
99 | */ | |
100 | ||
101 | unsigned hardware_concurrency(Display *dpy); | |
102 | /* This is supposed to return the number of available CPU cores. This number | |
103 | isn't necessarily constant: a system administrator can hotplug or | |
104 | enable/disable CPUs on certain systems, or the system can deactivate a | |
105 | malfunctioning core -- but these are rare. | |
106 | ||
107 | If threads are unavailable, this function will return 1. | |
108 | ||
109 | This function isn't fast; the result should be cached. | |
110 | */ | |
111 | ||
112 | unsigned thread_memory_alignment(Display *dpy); | |
113 | ||
114 | /* Returns the proper alignment for memory allocated by a thread that is | |
115 | shared with other threads. | |
116 | ||
117 | A typical CPU accesses the system RAM through a cache, and this cache is | |
118 | divided up into cache lines - aligned chunks of memory typically 32 or 64 | |
119 | bytes in size. Cache faults cause cache lines to be populated from | |
120 | memory. And, in a multiprocessing environment, two CPU cores can access the | |
121 | same cache line. The consequences of this depend on the CPU model: | |
122 | ||
123 | - x86 implements the MESI protocol [1] to maintain cache coherency between | |
124 | CPU cores, with a serious performance penalty on both Intel [1] and AMD | |
125 | [2]. Intel uses the term "false sharing" to describe two CPU cores | |
126 | accessing different memory in the same cache line. | |
127 | ||
128 | - ARM allows CPU caches to become inconsistent in this case [3]. Memory | |
129 | fences are needed to prevent horrible non-deterministic bugs from | |
130 | occurring. Other CPU architectures have similar behavior to one of the | |
131 | above, depending on whether they are "strongly-ordered" (like x86), or |
132 | "weakly-ordered" (like ARM). | |
133 | ||
134 | Aligning multithreaded memory accesses according to the cache line size | |
135 | neatly sidesteps both issues. | |
136 | ||
137 | One complication is that CPU caches are divided up into separate levels, | |
138 | and occasionally different levels can have different cache line sizes, so | |
139 | to be safe this function returns the largest cache line size among all | |
140 | levels. | |
141 | ||
142 | If multithreading is not in effect, this returns sizeof(void *), because | |
143 | posix_memalign(3) will error out if the alignment is set to be smaller than | |
144 | that. | |
145 | ||
146 | [1] Intel(R) 64 and IA-32 Architectures Optimization Reference Manual | |
147 | (Order Number: 248966-026): 2.1.5 Cache Hierarchy | |
148 | [2] Software Optimization Guide for AMD Family 10h Processors (Publication | |
149 | #40546): 11.3.4 Data Sharing between Caches | |
150 | [3] http://wanderingcoder.net/2011/04/01/arm-memory-ordering/ | |
151 | */ | |
152 | ||
153 | /* | |
154 | Note: aligned_malloc uses posix_memalign(3) when available, or malloc(3) | |
155 | otherwise. As of SUSv2 (1997), and *probably* earlier, these are guaranteed | |
156 | to be thread-safe. C89 does not discuss threads, or thread safety; | |
157 | non-POSIX systems, watch out! | |
158 | http://pubs.opengroup.org/onlinepubs/7908799/xsh/threads.html | |
159 | http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_09.html | |
160 | */ | |
161 | ||
/* int thread_malloc(void **ptr, Display *dpy, unsigned size); */
/* Allocates 'size' bytes aligned to thread_memory_alignment(dpy) and stores
   the block in *ptr.  Returns 0 on success, or an errno value (usually
   ENOMEM) on failure -- see the discussion below. */
#define thread_malloc(ptr, dpy, size) \
	(aligned_malloc((ptr), thread_memory_alignment(dpy), (size)))
165 | ||
166 | /* | |
167 | This simply does a malloc aligned to thread_memory_alignment(). See | |
168 | above. On failure, an errno is returned, usually ENOMEM. | |
169 | ||
170 | It's possible for two malloc()'d blocks to at least partially share the | |
171 | same cache line. When a different thread is writing to each block, then bad | |
172 | things can happen (see thread_memory_alignment). Better malloc() | |
173 | implementations will divide memory into pools belonging to one thread or | |
174 | another, causing memory blocks belonging to different threads to typically | |
175 | be located on different memory pages (see getpagesize(2)), mitigating the | |
176 | problem in question...but there's nothing stopping threads from passing | |
177 | memory to each other. And it's not practical for the system to align each | |
178 | block to 64 or 128 byte boundaries -- it's not uncommon to need lots and | |
179 | lots of 8-32 byte allocations, and the waste could become a bit excessive. | |
180 | ||
181 | Some rules of thumb to take away from this: | |
182 | ||
183 | 1. Use thread_malloc for memory that might be written to by a thread that |
184 | didn't originally allocate the object. | |
185 | ||
186 | 2. Use thread_malloc for memory that will be handed from one thread to |
187 | another. | |
188 | ||
189 | 3. Use malloc if a single thread allocates, reads from, writes to, and | |
190 | frees the block of memory. | |
191 | ||
192 | Oddly, I (Dave) have not seen this problem described anywhere else. | |
193 | */ | |
194 | ||
/* Releases a block obtained from thread_malloc. */
#define thread_free(ptr) aligned_free(ptr)
196 | ||
197 | #if HAVE_PTHREAD | |
198 | # if defined _POSIX_THREADS && _POSIX_THREADS >= 0 | |
199 | /* | |
200 | See The Open Group Base Specifications Issue 7, <unistd.h>, Constants for | |
201 | Options and Option Groups | |
202 | http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html#tag_13_77_03_02 | |
203 | */ | |
204 | ||
205 | # include <pthread.h> | |
206 | ||
/* Most PThread synchronization functions only fail when they are misused. */
/* PTHREAD_VERIFY(expr): evaluates expr exactly once in both build modes.
   In debug builds it asserts that the pthread call returned 0 (success);
   with NDEBUG the result is merely discarded, but the call still happens. */
# if defined NDEBUG
#  define PTHREAD_VERIFY(expr) (void)(expr)
# else
#  include <assert.h>
#  define PTHREAD_VERIFY(expr) assert(!(expr))
# endif
214 | ||
215 | extern const pthread_mutex_t mutex_initializer; | |
216 | extern const pthread_cond_t cond_initializer; | |
217 | ||
218 | # else | |
219 | /* Whatever caused HAVE_PTHREAD to be defined (configure script, | |
220 | usually) made a mistake if this is reached. */ | |
221 | /* Maybe this should be a warning. */ | |
222 | # error HAVE_PTHREAD is defined, but _POSIX_THREADS is not. | |
223 | /* #undef HAVE_PTHREAD */ | |
224 | # endif | |
225 | #endif | |
226 | ||
/* A pool of worker threads plus the bookkeeping needed to run a function on
   every thread and wait for completion.  Create with threadpool_create(),
   drive with threadpool_run()/threadpool_wait(), and dispose of with
   threadpool_destroy(). */
struct threadpool
{
	/* This is always the same as the count parameter fed to threadpool_create().
	   Here's a neat trick: if the threadpool is zeroed out with a memset, and
	   threadpool_create() is never called to create 0 threads, then
	   threadpool::count can be used to determine if the threadpool object was
	   ever initialized. */
	unsigned count;

	/* Copied from threadpool_class. No need for thread_create here, though. */
	size_t thread_size;
	void (*thread_run)(void *self);
	void (*thread_destroy)(void *self);

	/* NOTE(review): presumably the per-thread objects used when pthreads are
	   unavailable and the "threads" run serially -- confirm in thread_util.c. */
	void *serial_threads;

#if HAVE_PTHREAD
	/* Synchronization between the main thread and the workers.
	   NOTE(review): the exact handshake protocol lives in thread_util.c. */
	pthread_mutex_t mutex;
	pthread_cond_t cond;

	/* Number of threads waiting for the startup signal. */
	unsigned parallel_pending;

	/* Number of threads still running. During startup, this is the index of
	   the thread currently being initialized. */
	unsigned parallel_unfinished;

	pthread_t *parallel_threads;
#endif
};
256 | ||
257 | /* | |
258 | The threadpool_* functions manage a group of threads (naturally). Each | |
259 | thread owns an object described by a threadpool_class. When | |
260 | threadpool_run() is called, the specified func parameter is called on each | |
261 | thread in parallel. Sometime after calling threadpool_run(), call | |
262 | threadpool_wait(), which waits for each thread to return from | |
263 | threadpool_class::run(). | |
264 | ||
265 | Note that thread 0 runs on the thread from which threadpool_run is called | |
266 | from, so if each thread has an equal workload, then when threadpool_run | |
267 | returns, the other threads will be finished or almost finished. Adding code | |
268 | between threadpool_run and threadpool_wait increases the odds that | |
269 | threadpool_wait won't actually have to wait at all -- which is nice. | |
270 | ||
271 | If the system does not provide threads, then these functions will fake it: | |
272 | everything will appear to work normally from the perspective of the caller, | |
273 | but when threadpool_run() is called, the "threads" are run synchronously; | |
274 | threadpool_wait() does nothing. | |
275 | */ | |
276 | ||
/* Describes the per-thread private object owned by each worker in a
   threadpool: its size and its constructor/destructor.  Passed to
   threadpool_create(). */
struct threadpool_class
{
	/* Size of the thread private object. */
	size_t size;

	/* Create the thread private object. Called in sequence for each thread
	   (effectively) from threadpool_create.
	   self: A pointer to size bytes of memory, allocated to hold the thread
	         object.
	   pool: The threadpool object that owns all the threads. If the threadpool
	         is nested in another struct, try GET_PARENT_OBJ.
	   id:   The ID for the thread; numbering starts at zero and goes up by one
	         for each thread.
	   Return 0 on success. On failure, return a value from errno.h; this will
	   be returned from threadpool_create. */
	int (*create)(void *self, struct threadpool *pool, unsigned id);

	/* Destroys the thread private object. Called in sequence (though not always
	   the same sequence as create). Warning: During shutdown, it is possible
	   for destroy() to be called while other threads are still in
	   threadpool_run(). */
	void (*destroy)(void *self);
};
300 | ||
301 | /* Returns 0 on success, on failure can return ENOMEM, or any error code from | |
302 | threadpool_class.create. */ | |
303 | int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count); | |
304 | void threadpool_destroy(struct threadpool *self); | |
305 | ||
306 | void threadpool_run(struct threadpool *self, void (*func)(void *)); | |
307 | void threadpool_wait(struct threadpool *self); | |
308 | ||
309 | /* | |
310 | io_thread is meant to wrap blocking I/O operations in a one-shot worker | |
311 | thread, with cancel semantics. | |
312 | ||
313 | Unlike threadpool_*, io_thread will not 'fake it'; it is up to the caller | |
314 | to figure out what to do if the system doesn't have threads. In | |
315 | particular, the start_routine passed to io_thread_create will never be | |
316 | called. | |
317 | ||
318 | Clients of io_thread implement four functions: | |
319 | - state *process_start(...); | |
320 | Starts the worker thread. | |
321 | - bool process_is_done(state *); | |
322 | Returns true if the I/O operation is complete. | |
323 | - void process_cancel(state *); | |
324 | "Cancels" the I/O operation. The thread will continue to run, but it | |
325 | will detach, and clean itself up upon completion. | |
326 | - int process_finish(state *, ...) | |
327 | Waits for the I/O operation to complete, returns results, and cleans up. | |
328 | ||
329 | Or: /---\ | |
330 | \/ | /--> cancel | |
331 | start -> is_done --+ | |
332 | \--> finish | |
333 | ||
334 | These functions follow a basic pattern: | |
335 | - start: | |
336 | 1. Allocate a thread state object with thread_malloc. This state object |
337 | contains an io_thread member. | |
338 | 2. Save parameters from the start parameters to the state object. | |
339 | 3. Start the thread with io_thread_create. The thread receives the state |
340 | object as its parameter. | |
341 | - On the worker thread: | |
342 | 1. Do the I/O. | |
343 | 2. Call io_thread_return. | |
344 | 2a. If the result != 0, free the state object. | |
345 | - is_done: | |
346 | 1. Just call io_thread_is_done. |
347 | - cancel: | |
348 | 1. Call io_thread_cancel. | |
349 | 1a. If the result != 0, free the state object. | |
350 | - finish: | |
351 | 1. Call io_thread_finish. | |
352 | 2. Copy results out of the state object as needed. | |
353 | 3. Free the state object...or return it to the caller. | |
354 | ||
355 | Incidentally, there may sometimes be asynchronous versions of blocking I/O | |
356 | functions (struct aiocb and friends, for example); these should be | |
357 | preferred over io_thread when performance is a concern. | |
358 | */ | |
359 | ||
/* Lifecycle state of an io_thread, shared between the main thread and the
   worker.  NOTE(review): file-scope identifiers beginning with an underscore
   are reserved by the C standard (C11 7.1.3); renaming would change this
   header's interface, so the issue is only flagged here. */
enum _io_thread_status
{
	_io_thread_working, _io_thread_done, _io_thread_cancelled
};
364 | ||
/* Handle for a one-shot worker thread wrapping a blocking I/O operation;
   manipulated only through the io_thread_* functions below. */
struct io_thread
{
#if HAVE_PTHREAD
	/* Common misconception: "volatile" should be applied to atomic variables,
	   such as 'status', below. This is false, see
	   <http://stackoverflow.com/q/2484980>. */
	enum _io_thread_status status;
	pthread_t thread;
#else
	/* Placeholder: ISO C requires a struct to have at least one member. */
	char gcc_emits_a_warning_when_the_struct_has_no_members;
#endif
};
377 | ||
378 | #if HAVE_PTHREAD | |
379 | ||
380 | void *io_thread_create(struct io_thread *self, void *parent, void *(*start_routine)(void *), Display *dpy, unsigned stacksize); | |
381 | /* | |
382 | Create the thread, returns NULL on failure. Failure is usually due to | |
383 | ENOMEM, or the system doesn't support threads. | |
384 | self: The io_thread object to be initialized. | |
385 | parent: The parameter to start_routine. The io_thread should be | |
386 | contained within or be reachable from this. | |
387 | start_routine: The start routine for the worker thread. | |
388 | dpy: The X11 Display, so that '*useThreads' is honored. | |
389 | stacksize: The stack size for the thread. Set to 0 for the system | |
390 | default. | |
391 | A note about stacksize: Linux, for example, uses a default of 2 MB of | |
392 | stack per thread. Now, this memory is usually committed on the first | |
393 | write, so lots of threads won't waste RAM, but it does mean that on a | |
394 | 32-bit system, there's a limit of just under 1024 threads with the 2 MB | |
395 | default due to typical address space limitations of 2 GB for userspace | |
396 | processes. And 1024 threads might not always be enough... | |
397 | */ | |
398 | ||
399 | int io_thread_return(struct io_thread *self); | |
400 | /* Called at the end of start_routine, from above. Returns non-zero if the | |
401 | thread has been cancelled, and cleanup needs to take place. */ | |
402 | ||
403 | int io_thread_is_done(struct io_thread *self); | |
404 | /* Call from the main thread. Returns non-zero if the thread finished. */ | |
405 | ||
406 | int io_thread_cancel(struct io_thread *self); | |
407 | /* Call from the main thread if the results from the worker thread are not | |
408 | needed. This cleans up the io_thread. Returns non-zero if cleanup needs | |
409 | to take place. */ | |
410 | ||
411 | void io_thread_finish(struct io_thread *self); | |
412 | /* Call from the main thread to wait for the worker thread to finish. This | |
413 | cleans up the io_thread. */ | |
414 | ||
415 | #else | |
416 | ||
/* Stubs for systems without pthreads.  io_thread does not "fake it" (see
   above): creation always fails (NULL), and since no thread can ever exist,
   the remaining operations behave as no-ops on a thread that never started
   (return: not cancelled; is_done: finished; cancel: no cleanup needed). */
#define IO_THREAD_STACK_MIN 0

#define io_thread_create(self, parent, start_routine, dpy, stacksize) NULL
#define io_thread_return(self) 0
#define io_thread_is_done(self) 1
#define io_thread_cancel(self) 0
#define io_thread_finish(self)
424 | ||
425 | #endif | |
426 | ||
/* Boilerplate for screenhacks: splice THREAD_DEFAULTS into the hack's
   defaults array (THREAD_DEFAULTS_XLOCK is the newline-terminated variant)
   and THREAD_OPTIONS into its option table to expose a "*useThreads"
   resource with -threads/-no-threads command-line switches.  All three
   expand to nothing when pthreads are unavailable. */
#if HAVE_PTHREAD
# define THREAD_DEFAULTS "*useThreads: True",
# define THREAD_DEFAULTS_XLOCK "*useThreads: True\n"
# define THREAD_OPTIONS \
	{"-threads", ".useThreads", XrmoptionNoArg, "True"}, \
	{"-no-threads", ".useThreads", XrmoptionNoArg, "False"},
#else
# define THREAD_DEFAULTS
# define THREAD_DEFAULTS_XLOCK
# define THREAD_OPTIONS
#endif
438 | ||
439 | /* | |
440 | If a variable 'member' is known to be a member (named 'member_name') of a | |
441 | struct (named 'struct_name'), then this can find a pointer to the struct | |
442 | that contains it. | |
443 | */ | |
/* Recovers a pointer to the enclosing struct from a pointer to one of its
   members (cf. the Linux kernel's container_of).  The expansion is a fully
   parenthesized expression with no trailing semicolon, so it can be used in
   expression context; the previous definition's stray ';' made uses such as
   `f(GET_PARENT_OBJ(...))` a syntax error and left an empty statement after
   declarations.  'member' is parenthesized in case the argument is itself a
   non-trivial expression. */
#define GET_PARENT_OBJ(struct_name, member_name, member) \
	((struct_name *)((char *)(member) - offsetof(struct_name, member_name)))
445 | ||
446 | #endif |