+/*
+ * Synchronize with another thread's call to init_once_signal on the same
+ * HANDLE slot.
+ *
+ * ep points to the shared slot.  If the slot already reads as non-null there
+ * is nothing to wait for and we return immediately.  Otherwise an event is
+ * created and installed into the slot with an atomic compare-and-exchange
+ * against null; only when that install succeeds do we block on the event.
+ * Once created, the event handle is closed before returning whether or not
+ * we waited on it.
+ *
+ * init_once_signal is not defined here; presumably it either stores a
+ * non-null value into the slot or signals the event it finds there.
+ *
+ * NOTE(review): the result of CreateEvent is not checked -- confirm how a
+ * failed event allocation is meant to be handled.
+ */
+static void init_once_wait(HANDLE *ep)
+{
+    HANDLE event;
+
+    /* Adding zero serves as an atomic load with a memory barrier. */
+#if _WIN64
+    if (InterlockedAdd64((LONG64 *)ep, 0))
+        return;
+#else
+    if (InterlockedAdd((LONG *)ep, 0))
+        return;
+#endif
+
+    /* Default security, auto-reset, initially non-signaled, unnamed. */
+    event = CreateEvent(NULL, FALSE, FALSE, NULL);
+
+    /* Wait only if the slot was still null and now holds our event. */
+    if (InterlockedCompareExchangePointer(ep, event, NULL) == NULL)
+        WaitForSingleObject(event, INFINITE);
+
+    CloseHandle(event);
+}
+
+/*
+ * This implementation using an MCS lock variation requires only a single
+ * pointer of shared global state initialized to null, and in the uncontended
+ * case does not require allocation of any Windows resources whatsoever.
+ *
+ * These locks are described in the paper:
+ *
+ * "Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors"
+ * by John M. Mellor-Crummey and Michael L. Scott.
+ * ACM Transactions on Computer Systems Volume 9, Issue 1 (Feb. 1991).
+ *
+ * The basic idea is that each thread has a local state, which includes a
+ * pointer to the next waiting thread. The global state is a tail pointer to
+ * the last waiting thread. The running thread holds the lock and also the
+ * pointer to the first waiter.
+ *
+ * On acquire, atomically swap our fresh local state with the global tail
+ * pointer, becoming the new last waiter. We receive a pointer to the previous
+ * last waiter (or nothing, in the unlocked case). At this point it is safe
+ * for a new thread to come along and update the tail pointer again. If
+ * there was a previous last waiter, we then update its next pointer to point
+ * to our local state, signal that this is completed, and then wait to be
+ * signaled.
+ *
+ * On release, if the tail pointer points to us there are no waiters, and this
+ * can be confirmed with an atomic compare and exchange to the done state,
+ * which is equivalent to the original state except that a subsequent acquirer
+ * will know that the initialization has been previously completed.
+ *
+ * If that didn't unlock the lock, we need to wait for the signal from the next
+ * thread (which may not have updated our next pointer yet), then signal the
+ * next thread to wake up. Eventually the queue will empty and the lock is
+ * left in the done state, at which point a simple atomic load can determine
+ * that nothing else needs to happen.
+ */