| 1 | /* -*- mode: c; tab-width: 4; fill-column: 78 -*- */ |
| 2 | /* vi: set ts=4 tw=78: */ |
| 3 | |
| 4 | /* |
| 5 | thread_util.h, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com> |
| 6 | |
| 7 | Permission to use, copy, modify, distribute, and sell this software and its |
| 8 | documentation for any purpose is hereby granted without fee, provided that |
| 9 | the above copyright notice appear in all copies and that both that |
| 10 | copyright notice and this permission notice appear in supporting |
| 11 | documentation. No representations are made about the suitability of this |
| 12 | software for any purpose. It is provided "as is" without express or |
| 13 | implied warranty. |
| 14 | */ |
| 15 | |
| 16 | #ifndef THREAD_UTIL_H |
| 17 | #define THREAD_UTIL_H |
| 18 | |
| 19 | /* thread_util.h because C11 took threads.h. */ |
| 20 | |
| 21 | /* And POSIX threads because there aren't too many systems that support C11 |
| 22 | threads that don't already support POSIX threads. |
| 23 | ...Not that it would be too hard to convert from the one to the other. |
| 24 | Or to have both. |
| 25 | */ |
| 26 | |
| 27 | /* Beware! |
| 28 | Multithreading is a great way to add insidious and catastrophic bugs to |
| 29 | a program. Make sure you understand the risks. |
| 30 | |
| 31 | You may wish to become familiar with race conditions, deadlocks, mutexes, |
| 32 | condition variables, and, in lock-free code, memory ordering, cache |
| 33 | hierarchies, etc., before working with threads. |
| 34 | |
| 35 | On the other hand, if a screenhack locks up or crashes, it's not the |
| 36 | end of the world: XScreenSaver won't unlock the screen if that happens. |
| 37 | */ |
| 38 | |
| 39 | /* |
	The basic strategy for applying threads to a CPU-hungry screenhack:
| 41 | |
| 42 | 1. Find the CPU-hungry part of the hack. |
| 43 | |
| 44 | 2. Change that part so the workload can be divided into N equal-sized |
| 45 | loads, where N is the number of CPU cores in the machine. |
| 46 | (For example: with two cores, one core could render even scan lines, |
| 47 | and the other odd scan lines.) |
| 48 | |
| 49 | 2a. Keeping in mind that two threads should not write to the same memory |
| 50 | at the same time. Specifically, they should not be writing to the |
| 51 | same cache line at the same time -- so align memory allocation and |
| 52 | memory accesses to the system cache line size as necessary. |
| 53 | |
| 54 | 3. On screenhack_init, create a threadpool object. This creates N worker |
| 55 | threads, and each thread creates and owns a user-defined struct. |
| 56 | After creation, the threads are idle. |
| 57 | |
| 58 | 4. On screenhack_frame, call threadpool_run(). Each thread simultaneously |
| 59 | wakes up, calls a function that does one of the equal-sized loads, |
| 60 | then goes back to sleep. The main thread then calls threadpool_wait(), |
| 61 | which returns once all the worker threads have finished. |
| 62 | |
| 63 | Using this to implement SMP won't necessarily increase performance by |
   a factor of N (again, N is the number of CPU cores). Both X11 and Cocoa on OS X can
| 65 | impose a not-insignificant amount of overhead even when simply blitting |
| 66 | full-screen XImages @ 30 FPS. |
| 67 | |
| 68 | On systems with simultaneous multithreading (a.k.a. Hyper-threading), |
   performance gains may be slim to non-existent.
| 70 | */ |
| 71 | |
| 72 | #include "aligned_malloc.h" |
| 73 | |
| 74 | #if HAVE_CONFIG_H |
| 75 | /* For HAVE_PTHREAD. */ |
| 76 | # include "config.h" |
| 77 | #endif |
| 78 | |
| 79 | #include <stddef.h> |
| 80 | |
| 81 | #if HAVE_UNISTD_H |
| 82 | /* For _POSIX_THREADS. */ |
| 83 | # include <unistd.h> |
| 84 | #endif |
| 85 | |
| 86 | #if defined HAVE_JWXYZ |
| 87 | # include "jwxyz.h" |
| 88 | #else |
| 89 | # include <X11/Xlib.h> |
| 90 | #endif |
| 91 | |
| 92 | #if HAVE_PTHREAD |
| 93 | int threads_available(Display *dpy); |
| 94 | #else |
| 95 | # define threads_available(dpy) (-1) |
| 96 | #endif |
| 97 | /* > 0: Threads are available. This is normally _POSIX_VERSION. |
| 98 | -1: Threads are not available. |
| 99 | */ |
| 100 | |
| 101 | unsigned hardware_concurrency(Display *dpy); |
| 102 | /* This is supposed to return the number of available CPU cores. This number |
| 103 | isn't necessarily constant: a system administrator can hotplug or |
| 104 | enable/disable CPUs on certain systems, or the system can deactivate a |
| 105 | malfunctioning core -- but these are rare. |
| 106 | |
| 107 | If threads are unavailable, this function will return 1. |
| 108 | |
| 109 | This function isn't fast; the result should be cached. |
| 110 | */ |
| 111 | |
| 112 | unsigned thread_memory_alignment(Display *dpy); |
| 113 | |
| 114 | /* Returns the proper alignment for memory allocated by a thread that is |
| 115 | shared with other threads. |
| 116 | |
| 117 | A typical CPU accesses the system RAM through a cache, and this cache is |
| 118 | divided up into cache lines - aligned chunks of memory typically 32 or 64 |
| 119 | bytes in size. Cache faults cause cache lines to be populated from |
| 120 | memory. And, in a multiprocessing environment, two CPU cores can access the |
| 121 | same cache line. The consequences of this depend on the CPU model: |
| 122 | |
| 123 | - x86 implements the MESI protocol [1] to maintain cache coherency between |
| 124 | CPU cores, with a serious performance penalty on both Intel [1] and AMD |
| 125 | [2]. Intel uses the term "false sharing" to describe two CPU cores |
| 126 | accessing different memory in the same cache line. |
| 127 | |
| 128 | - ARM allows CPU caches to become inconsistent in this case [3]. Memory |
| 129 | fences are needed to prevent horrible non-deterministic bugs from |
| 130 | occurring. Other CPU architectures have similar behavior to one of the |
     above, depending on whether they are "strongly-ordered" (like x86), or
| 132 | "weakly-ordered" (like ARM). |
| 133 | |
| 134 | Aligning multithreaded memory accesses according to the cache line size |
| 135 | neatly sidesteps both issues. |
| 136 | |
| 137 | One complication is that CPU caches are divided up into separate levels, |
| 138 | and occasionally different levels can have different cache line sizes, so |
| 139 | to be safe this function returns the largest cache line size among all |
| 140 | levels. |
| 141 | |
| 142 | If multithreading is not in effect, this returns sizeof(void *), because |
| 143 | posix_memalign(3) will error out if the alignment is set to be smaller than |
| 144 | that. |
| 145 | |
| 146 | [1] Intel(R) 64 and IA-32 Architectures Optimization Reference Manual |
| 147 | (Order Number: 248966-026): 2.1.5 Cache Hierarchy |
| 148 | [2] Software Optimization Guide for AMD Family 10h Processors (Publication |
| 149 | #40546): 11.3.4 Data Sharing between Caches |
| 150 | [3] http://wanderingcoder.net/2011/04/01/arm-memory-ordering/ |
| 151 | */ |
| 152 | |
| 153 | /* |
| 154 | Note: aligned_malloc uses posix_memalign(3) when available, or malloc(3) |
| 155 | otherwise. As of SUSv2 (1997), and *probably* earlier, these are guaranteed |
| 156 | to be thread-safe. C89 does not discuss threads, or thread safety; |
| 157 | non-POSIX systems, watch out! |
| 158 | http://pubs.opengroup.org/onlinepubs/7908799/xsh/threads.html |
| 159 | http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_09.html |
| 160 | */ |
| 161 | |
/* int thread_malloc(void **ptr, Display *dpy, unsigned size); */
/* Allocates 'size' bytes aligned to thread_memory_alignment(dpy), storing
   the result in *ptr. Returns 0 on success, or an errno value (usually
   ENOMEM) on failure -- see the discussion below. Free with thread_free().
   All macro parameters are parenthesized (CERT PRE01-C). */
#define thread_malloc(ptr, dpy, size) \
	(aligned_malloc((ptr), thread_memory_alignment((dpy)), (size)))
| 165 | |
| 166 | /* |
| 167 | This simply does a malloc aligned to thread_memory_alignment(). See |
| 168 | above. On failure, an errno is returned, usually ENOMEM. |
| 169 | |
| 170 | It's possible for two malloc()'d blocks to at least partially share the |
| 171 | same cache line. When a different thread is writing to each block, then bad |
| 172 | things can happen (see thread_memory_alignment). Better malloc() |
| 173 | implementations will divide memory into pools belonging to one thread or |
| 174 | another, causing memory blocks belonging to different threads to typically |
| 175 | be located on different memory pages (see getpagesize(2)), mitigating the |
| 176 | problem in question...but there's nothing stopping threads from passing |
| 177 | memory to each other. And it's not practical for the system to align each |
| 178 | block to 64 or 128 byte boundaries -- it's not uncommon to need lots and |
| 179 | lots of 8-32 byte allocations, and the waste could become a bit excessive. |
| 180 | |
| 181 | Some rules of thumb to take away from this: |
| 182 | |
   1. Use thread_malloc for memory that might be written to by a thread that
      didn't originally allocate the object.

   2. Use thread_malloc for memory that will be handed from one thread to
      another.
| 188 | |
| 189 | 3. Use malloc if a single thread allocates, reads from, writes to, and |
| 190 | frees the block of memory. |
| 191 | |
| 192 | Oddly, I (Dave) have not seen this problem described anywhere else. |
| 193 | */ |
| 194 | |
/* Releases a block obtained from thread_malloc(). The parameter is
   parenthesized (CERT PRE01-C). */
#define thread_free(ptr) aligned_free((ptr))
| 196 | |
#if HAVE_PTHREAD
# if defined _POSIX_THREADS && _POSIX_THREADS >= 0
/*
	See The Open Group Base Specifications Issue 7, <unistd.h>, Constants for
	Options and Option Groups
	http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html#tag_13_77_03_02
*/

#  include <pthread.h>

/* Most PThread synchronization functions only fail when they are misused. */
/* PTHREAD_VERIFY(expr) evaluates expr -- a pthread call that returns 0 on
   success -- exactly once. In debug builds a non-zero result trips the
   assert; with NDEBUG the result is evaluated and discarded. */
#  if defined NDEBUG
#   define PTHREAD_VERIFY(expr) (void)(expr)
#  else
#   include <assert.h>
#   define PTHREAD_VERIFY(expr) assert(!(expr))
#  endif

/* const objects usable to initialize a mutex/cond by struct assignment,
   where the PTHREAD_*_INITIALIZER macros would only work in a declaration.
   NOTE(review): presumably defined in thread_util.c -- confirm. */
extern const pthread_mutex_t mutex_initializer;
extern const pthread_cond_t cond_initializer;

# else
/* Whatever caused HAVE_PTHREAD to be defined (configure script,
   usually) made a mistake if this is reached. */
/* Maybe this should be a warning. */
#  error HAVE_PTHREAD is defined, but _POSIX_THREADS is not.
/* #undef HAVE_PTHREAD */
# endif
#endif
| 226 | |
/* State for a pool of worker threads; create with threadpool_create(), see
   the protocol description below. Treat the members as private. */
struct threadpool
{
	/* This is always the same as the count parameter fed to threadpool_create().
	   Here's a neat trick: if the threadpool is zeroed out with a memset, and
	   threadpool_create() is never called to create 0 threads, then
	   threadpool::count can be used to determine if the threadpool object was
	   ever initialized. */
	unsigned count;

	/* Copied from threadpool_class. No need for thread_create here, though. */
	size_t thread_size;
	void (*thread_run)(void *self);
	void (*thread_destroy)(void *self);

	/* Thread-private objects used when threads are unavailable and the pool
	   runs serially. NOTE(review): presumably 'count' objects of
	   'thread_size' bytes each -- confirm against thread_util.c. */
	void *serial_threads;

#if HAVE_PTHREAD
	/* Synchronization for the startup/run/wait handshake between the main
	   thread and the workers. */
	pthread_mutex_t mutex;
	pthread_cond_t cond;

	/* Number of threads waiting for the startup signal. */
	unsigned parallel_pending;

	/* Number of threads still running. During startup, this is the index of
	   the thread currently being initialized. */
	unsigned parallel_unfinished;

	/* The worker pthreads. NOTE(review): likely count - 1 of them, since the
	   comment below says thread 0 runs on the caller's thread -- confirm in
	   thread_util.c. */
	pthread_t *parallel_threads;
#endif
};
| 256 | |
| 257 | /* |
| 258 | The threadpool_* functions manage a group of threads (naturally). Each |
| 259 | thread owns an object described by a threadpool_class. When |
| 260 | threadpool_run() is called, the specified func parameter is called on each |
| 261 | thread in parallel. Sometime after calling threadpool_run(), call |
| 262 | threadpool_wait(), which waits for each thread to return from |
| 263 | threadpool_class::run(). |
| 264 | |
| 265 | Note that thread 0 runs on the thread from which threadpool_run is called |
| 266 | from, so if each thread has an equal workload, then when threadpool_run |
| 267 | returns, the other threads will be finished or almost finished. Adding code |
| 268 | between threadpool_run and threadpool_wait increases the odds that |
| 269 | threadpool_wait won't actually have to wait at all -- which is nice. |
| 270 | |
| 271 | If the system does not provide threads, then these functions will fake it: |
| 272 | everything will appear to work normally from the perspective of the caller, |
| 273 | but when threadpool_run() is called, the "threads" are run synchronously; |
| 274 | threadpool_wait() does nothing. |
| 275 | */ |
| 276 | |
/* Describes the per-thread private objects of a threadpool; passed to
   threadpool_create(). */
struct threadpool_class
{
	/* Size of the thread private object. */
	size_t size;

	/* Create the thread private object. Called in sequence for each thread
	   (effectively) from threadpool_create.
	   self: A pointer to size bytes of memory, allocated to hold the thread
	         object.
	   pool: The threadpool object that owns all the threads. If the threadpool
	         is nested in another struct, try GET_PARENT_OBJ.
	   id:   The ID for the thread; numbering starts at zero and goes up by one
	         for each thread.
	   Return 0 on success. On failure, return a value from errno.h; this will
	   be returned from threadpool_create. */
	int (*create)(void *self, struct threadpool *pool, unsigned id);

	/* Destroys the thread private object. Called in sequence (though not always
	   the same sequence as create). Warning: During shutdown, it is possible
	   for destroy() to be called while other threads are still in
	   threadpool_run(). */
	void (*destroy)(void *self);
};
| 300 | |
| 301 | /* Returns 0 on success, on failure can return ENOMEM, or any error code from |
| 302 | threadpool_class.create. */ |
| 303 | int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count); |
| 304 | void threadpool_destroy(struct threadpool *self); |
| 305 | |
| 306 | void threadpool_run(struct threadpool *self, void (*func)(void *)); |
| 307 | void threadpool_wait(struct threadpool *self); |
| 308 | |
| 309 | /* |
| 310 | io_thread is meant to wrap blocking I/O operations in a one-shot worker |
| 311 | thread, with cancel semantics. |
| 312 | |
| 313 | Unlike threadpool_*, io_thread will not 'fake it'; it is up to the caller |
| 314 | to figure out what to do if the system doesn't have threads. In |
| 315 | particular, the start_routine passed to io_thread_create will never be |
| 316 | called. |
| 317 | |
| 318 | Clients of io_thread implement four functions: |
| 319 | - state *process_start(...); |
| 320 | Starts the worker thread. |
| 321 | - bool process_is_done(state *); |
| 322 | Returns true if the I/O operation is complete. |
| 323 | - void process_cancel(state *); |
| 324 | "Cancels" the I/O operation. The thread will continue to run, but it |
| 325 | will detach, and clean itself up upon completion. |
| 326 | - int process_finish(state *, ...) |
| 327 | Waits for the I/O operation to complete, returns results, and cleans up. |
| 328 | |
| 329 | Or: /---\ |
| 330 | \/ | /--> cancel |
| 331 | start -> is_done --+ |
| 332 | \--> finish |
| 333 | |
| 334 | These functions follow a basic pattern: |
| 335 | - start: |
    1. Allocate a thread state object with thread_malloc. This state object
| 337 | contains an io_thread member. |
| 338 | 2. Save parameters from the start parameters to the state object. |
| 339 | 3. Start the thread with _io_thread_create. The thread receives the state |
| 340 | object as its parameter. |
| 341 | - On the worker thread: |
| 342 | 1. Do the I/O. |
| 343 | 2. Call io_thread_return. |
| 344 | 2a. If the result != 0, free the state object. |
| 345 | - is_done: |
| 346 | 1. Just call _io_thread_is_done. |
| 347 | - cancel: |
| 348 | 1. Call io_thread_cancel. |
| 349 | 1a. If the result != 0, free the state object. |
| 350 | - finish: |
| 351 | 1. Call io_thread_finish. |
| 352 | 2. Copy results out of the state object as needed. |
| 353 | 3. Free the state object...or return it to the caller. |
| 354 | |
| 355 | Incidentally, there may sometimes be asynchronous versions of blocking I/O |
| 356 | functions (struct aiocb and friends, for example); these should be |
| 357 | preferred over io_thread when performance is a concern. |
| 358 | */ |
| 359 | |
/* Lifecycle of an io_thread, kept in io_thread::status.
   _io_thread_working:   the worker thread has not finished yet.
   _io_thread_done:      the worker called io_thread_return().
   _io_thread_cancelled: the main thread called io_thread_cancel(); the
                         worker cleans up after itself on completion.
   NOTE(review): transition details live in thread_util.c; the descriptions
   above follow the io_thread protocol documented earlier in this header. */
enum _io_thread_status
{
	_io_thread_working, _io_thread_done, _io_thread_cancelled
};
| 364 | |
/* Per-operation state for a one-shot I/O worker thread. Embed this in (or
   make it reachable from) the state object described in the io_thread
   protocol comment above. */
struct io_thread
{
#if HAVE_PTHREAD
	/* Common misconception: "volatile" should be applied to atomic variables,
	   such as 'status', below. This is false, see
	   <http://stackoverflow.com/q/2484980>. */
	enum _io_thread_status status;
	pthread_t thread;
#else
	/* A struct with no members is a constraint violation in C89/C90, and GCC
	   warns about it; this placeholder keeps the struct non-empty. */
	char gcc_emits_a_warning_when_the_struct_has_no_members;
#endif
};
| 377 | |
| 378 | #if HAVE_PTHREAD |
| 379 | |
| 380 | void *io_thread_create(struct io_thread *self, void *parent, void *(*start_routine)(void *), Display *dpy, unsigned stacksize); |
| 381 | /* |
| 382 | Create the thread, returns NULL on failure. Failure is usually due to |
| 383 | ENOMEM, or the system doesn't support threads. |
| 384 | self: The io_thread object to be initialized. |
| 385 | parent: The parameter to start_routine. The io_thread should be |
| 386 | contained within or be reachable from this. |
| 387 | start_routine: The start routine for the worker thread. |
| 388 | dpy: The X11 Display, so that '*useThreads' is honored. |
| 389 | stacksize: The stack size for the thread. Set to 0 for the system |
| 390 | default. |
| 391 | A note about stacksize: Linux, for example, uses a default of 2 MB of |
| 392 | stack per thread. Now, this memory is usually committed on the first |
| 393 | write, so lots of threads won't waste RAM, but it does mean that on a |
| 394 | 32-bit system, there's a limit of just under 1024 threads with the 2 MB |
| 395 | default due to typical address space limitations of 2 GB for userspace |
| 396 | processes. And 1024 threads might not always be enough... |
| 397 | */ |
| 398 | |
| 399 | int io_thread_return(struct io_thread *self); |
| 400 | /* Called at the end of start_routine, from above. Returns non-zero if the |
| 401 | thread has been cancelled, and cleanup needs to take place. */ |
| 402 | |
| 403 | int io_thread_is_done(struct io_thread *self); |
| 404 | /* Call from the main thread. Returns non-zero if the thread finished. */ |
| 405 | |
| 406 | int io_thread_cancel(struct io_thread *self); |
| 407 | /* Call from the main thread if the results from the worker thread are not |
| 408 | needed. This cleans up the io_thread. Returns non-zero if cleanup needs |
| 409 | to take place. */ |
| 410 | |
| 411 | void io_thread_finish(struct io_thread *self); |
| 412 | /* Call from the main thread to wait for the worker thread to finish. This |
| 413 | cleans up the io_thread. */ |
| 414 | |
#else

/* Threads are unavailable: io_thread degrades to inert stubs. Note that
   start_routine is never called in this configuration (see the io_thread
   description above); io_thread_create "fails" by returning NULL, and
   io_thread_is_done always reports completion. */

/* NOTE(review): IO_THREAD_STACK_MIN is defined only in this no-pthread
   branch; code passing it as a stacksize would not compile when
   HAVE_PTHREAD is set. Confirm whether the pthread branch should define it
   too (e.g. from PTHREAD_STACK_MIN in <limits.h>). */
#define IO_THREAD_STACK_MIN 0

#define io_thread_create(self, parent, start_routine, dpy, stacksize) NULL
#define io_thread_return(self) 0
#define io_thread_is_done(self) 1
#define io_thread_cancel(self) 0
#define io_thread_finish(self)

#endif
| 426 | |
/* Boilerplate for screenhacks that use threads: splice THREAD_DEFAULTS into
   the hack's defaults array (THREAD_DEFAULTS_XLOCK for the newline-separated
   xlock form) and THREAD_OPTIONS into its option table, so that the
   '*useThreads' resource exists and can be toggled on the command line with
   -threads / -no-threads. When threads are unavailable these expand to
   nothing. */
#if HAVE_PTHREAD
# define THREAD_DEFAULTS "*useThreads: True",
# define THREAD_DEFAULTS_XLOCK "*useThreads: True\n"
# define THREAD_OPTIONS \
	{"-threads", ".useThreads", XrmoptionNoArg, "True"}, \
	{"-no-threads", ".useThreads", XrmoptionNoArg, "False"},
#else
# define THREAD_DEFAULTS
# define THREAD_DEFAULTS_XLOCK
# define THREAD_OPTIONS
#endif
| 438 | |
| 439 | /* |
| 440 | If a variable 'member' is known to be a member (named 'member_name') of a |
| 441 | struct (named 'struct_name'), then this can find a pointer to the struct |
| 442 | that contains it. |
| 443 | */ |
/* Recover a pointer to the containing struct from a pointer to one of its
   members (the classic offsetof/container_of idiom). 'member' is
   parenthesized so expression arguments cast correctly, the whole expansion
   is parenthesized so it composes inside larger expressions, and the stray
   trailing semicolon is gone (CERT PRE11-C) -- callers supply their own. */
#define GET_PARENT_OBJ(struct_name, member_name, member) \
	((struct_name *)((char *)(member) - offsetof(struct_name, member_name)))
| 445 | |
| 446 | #endif |