| 1 | /* -*- mode: c; tab-width: 4; fill-column: 78 -*- */ |
| 2 | /* vi: set ts=4 tw=78: */ |
| 3 | |
| 4 | /* |
| 5 | thread_util.h, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com> |
| 6 | |
| 7 | Permission to use, copy, modify, distribute, and sell this software and its |
| 8 | documentation for any purpose is hereby granted without fee, provided that |
| 9 | the above copyright notice appear in all copies and that both that |
| 10 | copyright notice and this permission notice appear in supporting |
| 11 | documentation. No representations are made about the suitability of this |
| 12 | software for any purpose. It is provided "as is" without express or |
| 13 | implied warranty. |
| 14 | */ |
| 15 | |
| 16 | #ifndef THREAD_UTIL_H |
| 17 | #define THREAD_UTIL_H |
| 18 | |
| 19 | /* thread_util.h because C11 took threads.h. */ |
| 20 | |
| 21 | /* And POSIX threads because there aren't too many systems that support C11 |
| 22 | threads that don't already support POSIX threads. |
| 23 | ...Not that it would be too hard to convert from the one to the other. |
| 24 | Or to have both. |
| 25 | */ |
| 26 | |
| 27 | /* Beware! |
| 28 | Multithreading is a great way to add insidious and catastrophic bugs to |
| 29 | a program. Make sure you understand the risks. |
| 30 | |
| 31 | You may wish to become familiar with race conditions, deadlocks, mutexes, |
| 32 | condition variables, and, in lock-free code, memory ordering, cache |
| 33 | hierarchies, etc., before working with threads. |
| 34 | |
| 35 | On the other hand, if a screenhack locks up or crashes, it's not the |
| 36 | end of the world: XScreenSaver won't unlock the screen if that happens. |
| 37 | */ |
| 38 | |
| 39 | /* |
	The basic strategy for applying threads to a CPU-hungry screenhack:
| 41 | |
| 42 | 1. Find the CPU-hungry part of the hack. |
| 43 | |
| 44 | 2. Change that part so the workload can be divided into N equal-sized |
| 45 | loads, where N is the number of CPU cores in the machine. |
| 46 | (For example: with two cores, one core could render even scan lines, |
| 47 | and the other odd scan lines.) |
| 48 | |
| 49 | 2a. Keeping in mind that two threads should not write to the same memory |
| 50 | at the same time. Specifically, they should not be writing to the |
| 51 | same cache line at the same time -- so align memory allocation and |
| 52 | memory accesses to the system cache line size as necessary. |
| 53 | |
| 54 | 3. On screenhack_init, create a threadpool object. This creates N worker |
| 55 | threads, and each thread creates and owns a user-defined struct. |
| 56 | After creation, the threads are idle. |
| 57 | |
| 58 | 4. On screenhack_frame, call threadpool_run(). Each thread simultaneously |
| 59 | wakes up, calls a function that does one of the equal-sized loads, |
| 60 | then goes back to sleep. The main thread then calls threadpool_wait(), |
| 61 | which returns once all the worker threads have finished. |
| 62 | |
| 63 | Using this to implement SMP won't necessarily increase performance by |
   a factor of N (again, N is the number of CPU cores). Both X11 and Cocoa on OS X can
| 65 | impose a not-insignificant amount of overhead even when simply blitting |
| 66 | full-screen XImages @ 30 FPS. |
| 67 | |
| 68 | On systems with simultaneous multithreading (a.k.a. Hyper-threading), |
   performance gains may be slim to non-existent.
| 70 | */ |
| 71 | |
| 72 | #include "aligned_malloc.h" |
| 73 | |
| 74 | #if HAVE_CONFIG_H |
| 75 | /* For HAVE_PTHREAD. */ |
| 76 | # include "config.h" |
| 77 | #endif |
| 78 | |
| 79 | #include <stddef.h> |
| 80 | |
| 81 | #if HAVE_UNISTD_H |
| 82 | /* For _POSIX_THREADS. */ |
| 83 | # include <unistd.h> |
| 84 | #endif |
| 85 | |
| 86 | #if defined HAVE_JWXYZ |
| 87 | # include "jwxyz.h" |
| 88 | #else |
| 89 | # include <X11/Xlib.h> |
| 90 | #endif |
| 91 | |
| 92 | #if HAVE_PTHREAD |
| 93 | int threads_available(Display *dpy); |
| 94 | #else |
| 95 | # define threads_available(dpy) (-1) |
| 96 | #endif |
| 97 | /* > 0: Threads are available. This is normally _POSIX_VERSION. |
| 98 | -1: Threads are not available. |
| 99 | */ |
| 100 | |
| 101 | unsigned hardware_concurrency(Display *dpy); |
| 102 | /* This is supposed to return the number of available CPU cores. This number |
| 103 | isn't necessarily constant: a system administrator can hotplug or |
| 104 | enable/disable CPUs on certain systems, or the system can deactivate a |
| 105 | malfunctioning core -- but these are rare. |
| 106 | |
| 107 | If threads are unavailable, this function will return 1. |
| 108 | |
| 109 | This function isn't fast; the result should be cached. |
| 110 | */ |
| 111 | |
| 112 | unsigned thread_memory_alignment(Display *dpy); |
| 113 | |
| 114 | /* Returns the proper alignment for memory allocated by a thread that is |
| 115 | shared with other threads. |
| 116 | |
| 117 | A typical CPU accesses the system RAM through a cache, and this cache is |
| 118 | divided up into cache lines - aligned chunks of memory typically 32 or 64 |
| 119 | bytes in size. Cache faults cause cache lines to be populated from |
| 120 | memory. And, in a multiprocessing environment, two CPU cores can access the |
| 121 | same cache line. The consequences of this depend on the CPU model: |
| 122 | |
| 123 | - x86 implements the MESI protocol [1] to maintain cache coherency between |
| 124 | CPU cores, with a serious performance penalty on both Intel [1] and AMD |
| 125 | [2]. Intel uses the term "false sharing" to describe two CPU cores |
| 126 | accessing different memory in the same cache line. |
| 127 | |
| 128 | - ARM allows CPU caches to become inconsistent in this case [3]. Memory |
| 129 | fences are needed to prevent horrible non-deterministic bugs from |
| 130 | occurring. Other CPU architectures have similar behavior to one of the |
     above, depending on whether they are "strongly-ordered" (like x86), or
| 132 | "weakly-ordered" (like ARM). |
| 133 | |
| 134 | Aligning multithreaded memory accesses according to the cache line size |
| 135 | neatly sidesteps both issues. |
| 136 | |
| 137 | One complication is that CPU caches are divided up into separate levels, |
| 138 | and occasionally different levels can have different cache line sizes, so |
| 139 | to be safe this function returns the largest cache line size among all |
| 140 | levels. |
| 141 | |
| 142 | If multithreading is not in effect, this returns sizeof(void *), because |
| 143 | posix_memalign(3) will error out if the alignment is set to be smaller than |
| 144 | that. |
| 145 | |
| 146 | [1] Intel(R) 64 and IA-32 Architectures Optimization Reference Manual |
| 147 | (Order Number: 248966-026): 2.1.5 Cache Hierarchy |
| 148 | [2] Software Optimization Guide for AMD Family 10h Processors (Publication |
| 149 | #40546): 11.3.4 Data Sharing between Caches |
| 150 | [3] http://wanderingcoder.net/2011/04/01/arm-memory-ordering/ |
| 151 | */ |
| 152 | |
| 153 | /* |
| 154 | Note: aligned_malloc uses posix_memalign(3) when available, or malloc(3) |
| 155 | otherwise. As of SUSv2 (1997), and *probably* earlier, these are guaranteed |
| 156 | to be thread-safe. C89 does not discuss threads, or thread safety; |
| 157 | non-POSIX systems, watch out! |
| 158 | http://pubs.opengroup.org/onlinepubs/7908799/xsh/threads.html |
| 159 | http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_09.html |
| 160 | */ |
| 161 | |
/* int thread_malloc(void **ptr, Display *dpy, unsigned size); */
/* Allocates 'size' bytes aligned to thread_memory_alignment(dpy), storing
   the result in *ptr. Returns 0 on success, or an errno value (usually
   ENOMEM) on failure -- see the discussion below. Free with thread_free().
   All macro parameters are parenthesized (CERT PRE01-C). */
#define thread_malloc(ptr, dpy, size) \
	(aligned_malloc((ptr), thread_memory_alignment((dpy)), (size)))
| 165 | |
| 166 | /* |
| 167 | This simply does a malloc aligned to thread_memory_alignment(). See |
| 168 | above. On failure, an errno is returned, usually ENOMEM. |
| 169 | |
| 170 | It's possible for two malloc()'d blocks to at least partially share the |
| 171 | same cache line. When a different thread is writing to each block, then bad |
| 172 | things can happen (see thread_memory_alignment). Better malloc() |
| 173 | implementations will divide memory into pools belonging to one thread or |
| 174 | another, causing memory blocks belonging to different threads to typically |
| 175 | be located on different memory pages (see getpagesize(2)), mitigating the |
| 176 | problem in question...but there's nothing stopping threads from passing |
| 177 | memory to each other. And it's not practical for the system to align each |
| 178 | block to 64 or 128 byte boundaries -- it's not uncommon to need lots and |
| 179 | lots of 8-32 byte allocations, and the waste could become a bit excessive. |
| 180 | |
| 181 | Some rules of thumb to take away from this: |
| 182 | |
   1. Use thread_malloc for memory that might be written to by a thread that
      didn't originally allocate the object.

   2. Use thread_malloc for memory that will be handed from one thread to
      another.
| 188 | |
| 189 | 3. Use malloc if a single thread allocates, reads from, writes to, and |
| 190 | frees the block of memory. |
| 191 | |
| 192 | Oddly, I (Dave) have not seen this problem described anywhere else. |
| 193 | */ |
| 194 | |
/* Releases a block obtained from thread_malloc(). The parameter is
   parenthesized (CERT PRE01-C). */
#define thread_free(ptr) aligned_free((ptr))
| 196 | |
#if HAVE_PTHREAD
# if defined _POSIX_THREADS && _POSIX_THREADS >= 0
/*
	See The Open Group Base Specifications Issue 7, <unistd.h>, Constants for
	Options and Option Groups
	http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html#tag_13_77_03_02
*/

#  include <pthread.h>

/* Most PThread synchronization functions only fail when they are misused. */
/* PTHREAD_VERIFY(expr) evaluates expr -- a pthread call that returns 0 on
   success -- exactly once. In debug builds a non-zero result trips the
   assert; with NDEBUG the result is evaluated and discarded. */
#  if defined NDEBUG
#   define PTHREAD_VERIFY(expr) (void)(expr)
#  else
#   include <assert.h>
#   define PTHREAD_VERIFY(expr) assert(!(expr))
#  endif

/* const objects usable to initialize a mutex/cond by struct assignment,
   where the PTHREAD_*_INITIALIZER macros would only work in a declaration.
   NOTE(review): presumably defined in thread_util.c -- confirm. */
extern const pthread_mutex_t mutex_initializer;
extern const pthread_cond_t cond_initializer;

# else
/* Whatever caused HAVE_PTHREAD to be defined (configure script,
   usually) made a mistake if this is reached. */
/* Maybe this should be a warning. */
#  error HAVE_PTHREAD is defined, but _POSIX_THREADS is not.
/* #undef HAVE_PTHREAD */
# endif
#endif
| 226 | |
/* State for a pool of worker threads; create with threadpool_create(), see
   the protocol description below. Treat the members as private. */
struct threadpool
{
	/* This is always the same as the count parameter fed to threadpool_create().
	   Here's a neat trick: if the threadpool is zeroed out with a memset, and
	   threadpool_create() is never called to create 0 threads, then
	   threadpool::count can be used to determine if the threadpool object was
	   ever initialized. */
	unsigned count;

	/* Copied from threadpool_class. No need for thread_create here, though. */
	size_t thread_size;
	void (*thread_run)(void *self);
	void (*thread_destroy)(void *self);

	/* Thread-private objects used when threads are unavailable and the pool
	   runs serially. NOTE(review): presumably 'count' objects of
	   'thread_size' bytes each -- confirm against thread_util.c. */
	void *serial_threads;

#if HAVE_PTHREAD
	/* Synchronization for the startup/run/wait handshake between the main
	   thread and the workers. */
	pthread_mutex_t mutex;
	pthread_cond_t cond;

	/* Number of threads waiting for the startup signal. */
	unsigned parallel_pending;

	/* Number of threads still running. During startup, this is the index of
	   the thread currently being initialized. */
	unsigned parallel_unfinished;

	/* The worker pthreads. NOTE(review): likely count - 1 of them, since the
	   comment below says thread 0 runs on the caller's thread -- confirm in
	   thread_util.c. */
	pthread_t *parallel_threads;
#endif
};
| 256 | |
| 257 | /* |
| 258 | The threadpool_* functions manage a group of threads (naturally). Each |
| 259 | thread owns an object described by a threadpool_class. When |
| 260 | threadpool_run() is called, the specified func parameter is called on each |
| 261 | thread in parallel. Sometime after calling threadpool_run(), call |
| 262 | threadpool_wait(), which waits for each thread to return from |
| 263 | threadpool_class::run(). |
| 264 | |
| 265 | Note that thread 0 runs on the thread from which threadpool_run is called |
| 266 | from, so if each thread has an equal workload, then when threadpool_run |
| 267 | returns, the other threads will be finished or almost finished. Adding code |
| 268 | between threadpool_run and threadpool_wait increases the odds that |
| 269 | threadpool_wait won't actually have to wait at all -- which is nice. |
| 270 | |
| 271 | If the system does not provide threads, then these functions will fake it: |
| 272 | everything will appear to work normally from the perspective of the caller, |
| 273 | but when threadpool_run() is called, the "threads" are run synchronously; |
| 274 | threadpool_wait() does nothing. |
| 275 | */ |
| 276 | |
/* Describes the per-thread private objects of a threadpool; passed to
   threadpool_create(). */
struct threadpool_class
{
	/* Size of the thread private object. */
	size_t size;

	/* Create the thread private object. Called in sequence for each thread
	   (effectively) from threadpool_create.
	   self: A pointer to size bytes of memory, allocated to hold the thread
	         object.
	   pool: The threadpool object that owns all the threads. If the threadpool
	         is nested in another struct, try GET_PARENT_OBJ.
	   id:   The ID for the thread; numbering starts at zero and goes up by one
	         for each thread.
	   Return 0 on success. On failure, return a value from errno.h; this will
	   be returned from threadpool_create. */
	int (*create)(void *self, struct threadpool *pool, unsigned id);

	/* Destroys the thread private object. Called in sequence (though not always
	   the same sequence as create). Warning: During shutdown, it is possible
	   for destroy() to be called while other threads are still in
	   threadpool_run(). */
	void (*destroy)(void *self);
};
| 300 | |
| 301 | /* Returns 0 on success, on failure can return ENOMEM, or any error code from |
| 302 | threadpool_class.create. */ |
| 303 | int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count); |
| 304 | void threadpool_destroy(struct threadpool *self); |
| 305 | |
| 306 | void threadpool_run(struct threadpool *self, void (*func)(void *)); |
| 307 | void threadpool_wait(struct threadpool *self); |
| 308 | |
| 309 | /* |
| 310 | io_thread is meant to wrap blocking I/O operations in a one-shot worker |
| 311 | thread, with cancel semantics. |
| 312 | |
| 313 | Unlike threadpool_*, io_thread will not 'fake it'; it is up to the caller |
| 314 | to figure out what to do if the system doesn't have threads. In |
| 315 | particular, the start_routine passed to io_thread_create will never be |
| 316 | called. |
| 317 | |
| 318 | Clients of io_thread implement four functions: |
| 319 | - state *process_start(...); |
| 320 | Starts the worker thread. |
| 321 | - bool process_is_done(state *); |
| 322 | Returns true if the I/O operation is complete. |
| 323 | - void process_cancel(state *); |
| 324 | "Cancels" the I/O operation. The thread will continue to run, but it |
| 325 | will detach, and clean itself up upon completion. |
| 326 | - int process_finish(state *, ...) |
| 327 | Waits for the I/O operation to complete, returns results, and cleans up. |
| 328 | |
| 329 | Or: /---\ |
| 330 | \/ | /--> cancel |
| 331 | start -> is_done --+ |
| 332 | \--> finish |
| 333 | |
| 334 | These functions follow a basic pattern: |
| 335 | - start: |
    1. Allocate a thread state object with thread_malloc. This state object
| 337 | contains an io_thread member. |
| 338 | 2. Save parameters from the start parameters to the state object. |
| 339 | 3. Start the thread with _io_thread_create. The thread receives the state |
| 340 | object as its parameter. |
| 341 | - On the worker thread: |
| 342 | 1. Do the I/O. |
| 343 | 2. Call io_thread_return. |
| 344 | 2a. If the result != 0, free the state object. |
| 345 | - is_done: |
| 346 | 1. Just call _io_thread_is_done. |
| 347 | - cancel: |
| 348 | 1. Call io_thread_cancel. |
| 349 | 1a. If the result != 0, free the state object. |
| 350 | - finish: |
| 351 | 1. Call io_thread_finish. |
| 352 | 2. Copy results out of the state object as needed. |
| 353 | 3. Free the state object...or return it to the caller. |
| 354 | |
| 355 | Incidentally, there may sometimes be asynchronous versions of blocking I/O |
| 356 | functions (struct aiocb and friends, for example); these should be |
| 357 | preferred over io_thread when performance is a concern. |
| 358 | */ |
| 359 | |
/* Lifecycle of an io_thread, kept in io_thread::status.
   _io_thread_working:   the worker thread has not finished yet.
   _io_thread_done:      the worker called io_thread_return().
   _io_thread_cancelled: the main thread called io_thread_cancel(); the
                         worker cleans up after itself on completion.
   NOTE(review): transition details live in thread_util.c; the descriptions
   above follow the io_thread protocol documented earlier in this header. */
enum _io_thread_status
{
	_io_thread_working, _io_thread_done, _io_thread_cancelled
};
| 364 | |
/* Per-operation state for a one-shot I/O worker thread. Embed this in (or
   make it reachable from) the state object described in the io_thread
   protocol comment above. */
struct io_thread
{
#if HAVE_PTHREAD
	/* Common misconception: "volatile" should be applied to atomic variables,
	   such as 'status', below. This is false, see
	   <http://stackoverflow.com/q/2484980>. */
	enum _io_thread_status status;
	pthread_t thread;
#else
	/* A struct with no members is a constraint violation in C89/C90, and GCC
	   warns about it; this placeholder keeps the struct non-empty. */
	char gcc_emits_a_warning_when_the_struct_has_no_members;
#endif
};
| 377 | |
| 378 | #if HAVE_PTHREAD |
| 379 | |
| 380 | void *io_thread_create(struct io_thread *self, void *parent, void *(*start_routine)(void *), Display *dpy, unsigned stacksize); |
| 381 | /* |
| 382 | Create the thread, returns NULL on failure. Failure is usually due to |
| 383 | ENOMEM, or the system doesn't support threads. |
| 384 | self: The io_thread object to be initialized. |
| 385 | parent: The parameter to start_routine. The io_thread should be |
| 386 | contained within or be reachable from this. |
| 387 | start_routine: The start routine for the worker thread. |
| 388 | dpy: The X11 Display, so that '*useThreads' is honored. |
| 389 | stacksize: The stack size for the thread. Set to 0 for the system |
| 390 | default. |
| 391 | A note about stacksize: Linux, for example, uses a default of 2 MB of |
| 392 | stack per thread. Now, this memory is usually committed on the first |
| 393 | write, so lots of threads won't waste RAM, but it does mean that on a |
| 394 | 32-bit system, there's a limit of just under 1024 threads with the 2 MB |
| 395 | default due to typical address space limitations of 2 GB for userspace |
| 396 | processes. And 1024 threads might not always be enough... |
| 397 | */ |
| 398 | |
| 399 | int io_thread_return(struct io_thread *self); |
| 400 | /* Called at the end of start_routine, from above. Returns non-zero if the |
| 401 | thread has been cancelled, and cleanup needs to take place. */ |
| 402 | |
| 403 | int io_thread_is_done(struct io_thread *self); |
| 404 | /* Call from the main thread. Returns non-zero if the thread finished. */ |
| 405 | |
| 406 | int io_thread_cancel(struct io_thread *self); |
| 407 | /* Call from the main thread if the results from the worker thread are not |
| 408 | needed. This cleans up the io_thread. Returns non-zero if cleanup needs |
| 409 | to take place. */ |
| 410 | |
| 411 | void io_thread_finish(struct io_thread *self); |
| 412 | /* Call from the main thread to wait for the worker thread to finish. This |
| 413 | cleans up the io_thread. */ |
| 414 | |
#else

/* Threads are unavailable: io_thread degrades to inert stubs. Note that
   start_routine is never called in this configuration (see the io_thread
   description above); io_thread_create "fails" by returning NULL, and
   io_thread_is_done always reports completion. */

/* NOTE(review): IO_THREAD_STACK_MIN is defined only in this no-pthread
   branch; code passing it as a stacksize would not compile when
   HAVE_PTHREAD is set. Confirm whether the pthread branch should define it
   too (e.g. from PTHREAD_STACK_MIN in <limits.h>). */
#define IO_THREAD_STACK_MIN 0

#define io_thread_create(self, parent, start_routine, dpy, stacksize) NULL
#define io_thread_return(self) 0
#define io_thread_is_done(self) 1
#define io_thread_cancel(self) 0
#define io_thread_finish(self)

#endif
| 426 | |
/* Boilerplate for screenhacks that use threads: splice THREAD_DEFAULTS into
   the hack's defaults array (THREAD_DEFAULTS_XLOCK for the newline-separated
   xlock form) and THREAD_OPTIONS into its option table, so that the
   '*useThreads' resource exists and can be toggled on the command line with
   -threads / -no-threads. When threads are unavailable these expand to
   nothing. */
#if HAVE_PTHREAD
# define THREAD_DEFAULTS "*useThreads: True",
# define THREAD_DEFAULTS_XLOCK "*useThreads: True\n"
# define THREAD_OPTIONS \
	{"-threads", ".useThreads", XrmoptionNoArg, "True"}, \
	{"-no-threads", ".useThreads", XrmoptionNoArg, "False"},
#else
# define THREAD_DEFAULTS
# define THREAD_DEFAULTS_XLOCK
# define THREAD_OPTIONS
#endif
| 438 | |
| 439 | /* |
| 440 | If a variable 'member' is known to be a member (named 'member_name') of a |
| 441 | struct (named 'struct_name'), then this can find a pointer to the struct |
| 442 | that contains it. |
| 443 | */ |
/* Recover a pointer to the containing struct from a pointer to one of its
   members (the classic offsetof/container_of idiom). 'member' is
   parenthesized so expression arguments cast correctly, the whole expansion
   is parenthesized so it composes inside larger expressions, and the stray
   trailing semicolon is gone (CERT PRE11-C) -- callers supply their own. */
#define GET_PARENT_OBJ(struct_name, member_name, member) \
	((struct_name *)((char *)(member) - offsetof(struct_name, member_name)))
| 445 | |
| 446 | #endif |