/*
 *	UCW Library -- Fast pseudo-random generator
 *
 *	(c) 2020--2022 Pavel Charvat
 *
 *	This software may be freely distributed and used according to the terms
 *	of the GNU Lesser General Public License.
 */

#undef LOCAL_DEBUG

/* NOTE: The exact header list is reconstructed from what the code uses;
 * the ucw/ header names in particular are an assumption. */
#include <ucw/lib.h>
#include <ucw/fastrand.h>
#include <ucw/strongrand.h>
#include <ucw/threads.h>
#include <ucw/unaligned.h>

#include <math.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

// Select exactly one of these pseudo-random generators:
//#define FASTRAND_USE_PCG
//#define FASTRAND_USE_SPLITMIX
#define FASTRAND_USE_XOROSHIRO
//#define FASTRAND_USE_LIBC_SVID

/**
 * FASTRAND_USE_PCG
 *
 * References:
 *  -- https://en.wikipedia.org/wiki/Permuted_congruential_generator
 *  -- https://www.pcg-random.org/
 *  -- "PCG: A Family of Simple Fast Space-Efficient Statistically Good Algorithms
 *     for Random Number Generation", 2014, Melissa E. O’Neill
 *
 * Notes:
 *  -- A simple and fast generator with good statistical properties.
 *  -- We use a constant increment and the 64->32 bit XSH RR output function.
 **/
#define FASTRAND_PCG_MUL 6364136223846793005LLU
#define FASTRAND_PCG_INC 1442695040888963407LLU

/**
 * FASTRAND_USE_SPLITMIX
 *
 * References:
 *  -- http://prng.di.unimi.it/splitmix64.c (public domain, no copyright)
 *  -- http://docs.oracle.com/javase/8/docs/api/java/util/SplittableRandom.html
 *  -- "Fast Splittable Pseudorandom Number Generators",
 *     2014, Guy L. Steele Jr., Doug Lea, Christine H. Flood
 *
 * Notes:
 *  -- Each 64-bit number is generated exactly once per cycle,
 *     so we only use half of it -> 32-bit output.
 **/

/**
 * FASTRAND_USE_XOROSHIRO
 *
 * References:
 *  -- http://prng.di.unimi.it/ (the examples are public domain, no copyright)
 *  -- "Scrambled Linear Pseudorandom Number Generators",
 *     2019, David Blackman, Sebastiano Vigna
 *
 * Notes:
 *  -- A family of very fast generators with good statistical properties;
 *     some variants merely have problems in the lowest bits.
 *  -- We support several xo(ro)shiro variants, see the options below.
 **/

// Engine: xoroshiro 128, xoshiro 256
#define FASTRAND_XOROSHIRO_SIZE 128

// Scrambler: 1 (+), 2 (++), 3 (**)
#define FASTRAND_XOROSHIRO_SCRAMBLER 1

// If defined, we only use the highest 60 bits of each 64-bit number;
// useful especially for the weakest '+' scrambler. Even if not defined,
// we prefer the higher bits where it is trivial to do so.
#undef FASTRAND_XOROSHIRO_ONLY_HIGHER

/**
 * FASTRAND_USE_LIBC_SVID
 *
 *  -- mrand48_r()
 *  -- libc SVID generator
 **/

struct fastrand {
#ifdef FASTRAND_USE_PCG
  u64 pcg_state;			// PCG generator's current state
#endif
#ifdef FASTRAND_USE_SPLITMIX
  u64 splitmix_state;
#endif
#ifdef FASTRAND_USE_XOROSHIRO
  u64 xoroshiro_state[FASTRAND_XOROSHIRO_SIZE / 64];
#endif
#ifdef FASTRAND_USE_LIBC_SVID
  struct drand48_data svid_data;	// SVID generator's state structure
#endif
  bool inited;				// Is seeded?
  fastrand_seed_t seed_value;		// Initial seed value; useful for reproducible debugging
};
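/*
 * Each generator section below provides the same internal interface
 * (a summary for orientation; the actual definitions follow):
 *
 *	#define FASTRAND_ONE_BITS ...			  // bits produced by one step
 *	static void fastrand_init(struct fastrand *c);	  // seed the state from c->seed_value
 *	static inline u32/u64 fastrand_one(struct fastrand *c);  // produce one output word
 *
 * The reading interface at the end of this file is built solely on top of
 * these, so the generators are interchangeable at compile time.
 */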
/*** Common helpers ***/

// GCC should optimize these to a single instruction
static inline u32 fastrand_ror32(u32 x, uint r)
{
  return (x >> r) | (x << (-r & 31));
}

static inline u64 fastrand_rol64(u64 x, uint r)
{
  return (x << r) | (x >> (-r & 63));
}

/*** PCG random generator ***/

#ifdef FASTRAND_USE_PCG

#define FASTRAND_ONE_BITS 32

static inline u32 fastrand_one(struct fastrand *c)
{
  u64 s = c->pcg_state;
  uint r = s >> 59;
  c->pcg_state = s * FASTRAND_PCG_MUL + FASTRAND_PCG_INC;
  u32 x = ((s >> 18) ^ s) >> 27;
  return fastrand_ror32(x, r);
}

static void fastrand_init(struct fastrand *c)
{
  c->pcg_state = c->seed_value + FASTRAND_PCG_INC;
  (void) fastrand_one(c);
  DBG("RANDOM: Initialized PCG random generator, seed=0x%jx", (uintmax_t)c->seed_value);
}

#endif

/*** splitmix64 generator ***/

// A helper; used also for seeding xoroshiro
static inline u64 fastrand_splitmix64_next(u64 *state)
{
  u64 z = (*state += 0x9e3779b97f4a7c15LLU);
  z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9LLU;
  z = (z ^ (z >> 27)) * 0x94d049bb133111ebLLU;
  return z ^ (z >> 31);
}

#ifdef FASTRAND_USE_SPLITMIX

#define FASTRAND_ONE_BITS 32

static void fastrand_init(struct fastrand *c)
{
  c->splitmix_state = c->seed_value;
  DBG("RANDOM: Initialized splitmix64 random generator, seed=0x%jx", (uintmax_t)c->seed_value);
}

static inline u32 fastrand_one(struct fastrand *c)
{
  return fastrand_splitmix64_next(&c->splitmix_state);
}

#endif

/*** xoroshiro variants ***/

#ifdef FASTRAND_USE_XOROSHIRO

#ifdef FASTRAND_XOROSHIRO_ONLY_HIGHER
# define FASTRAND_ONE_BITS 60
#else
# define FASTRAND_ONE_BITS 64
# define FASTRAND_PREFER_HIGHER
#endif

static void fastrand_init(struct fastrand *c)
{
  u64 x = c->seed_value;
  for (uint i = 0; i < ARRAY_SIZE(c->xoroshiro_state); i++)
    c->xoroshiro_state[i] = fastrand_splitmix64_next(&x);
  if (unlikely(!c->xoroshiro_state[0]))
    c->xoroshiro_state[0] = 1;
  DBG("RANDOM: Initialized xoroshiro random generator, seed=0x%jx, size=%u, scrambler=%u",
      (uintmax_t)c->seed_value, FASTRAND_XOROSHIRO_SIZE, FASTRAND_XOROSHIRO_SCRAMBLER);
}

static inline u64 fastrand_one(struct fastrand *c)
{
  u64 result;
  u64 *s = c->xoroshiro_state;

#if FASTRAND_XOROSHIRO_SIZE == 128
  u64 s0 = s[0];
  u64 s1 = s[1];
#if FASTRAND_XOROSHIRO_SCRAMBLER == 1
  result = s0 + s1;
  uint ra = 24, rb = 16, rc = 37;
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 2
  result = fastrand_rol64(s0 + s1, 17) + s0;
  uint ra = 49, rb = 21, rc = 28;
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 3
  result = fastrand_rol64(s0 * 5, 7) * 9;
  uint ra = 24, rb = 16, rc = 37;
#else
# error "Unsupported xoroshiro parameters"
#endif
  s[0] = fastrand_rol64(s0, ra) ^ s1 ^ (s1 << rb);
  s[1] = fastrand_rol64(s1, rc);

#elif FASTRAND_XOROSHIRO_SIZE == 256
#if FASTRAND_XOROSHIRO_SCRAMBLER == 1
  result = s[0] + s[3];
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 2
  result = fastrand_rol64(s[0] + s[3], 23) + s[0];
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 3
  result = fastrand_rol64(s[1] * 5, 7) * 9;
#else
# error "Unsupported xoroshiro parameters"
#endif
  u64 t = s[1] << 17;
  s[2] ^= s[0];
  s[3] ^= s[1];
  s[1] ^= s[2];
  s[0] ^= s[3];
  s[2] ^= t;
  s[3] = fastrand_rol64(s[3], 45);

#else
# error "Unsupported xoroshiro parameters"
#endif

#ifdef FASTRAND_XOROSHIRO_ONLY_HIGHER
  result >>= 4;
#endif
  return result;
}

#endif
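/*
 * Configuration example: to switch from the default xoroshiro128+ to
 * xoshiro256**, change the options above to:
 *
 *	#define FASTRAND_XOROSHIRO_SIZE 256
 *	#define FASTRAND_XOROSHIRO_SCRAMBLER 3
 *
 * The '**' scrambler does not share the weak low bits of '+', so
 * FASTRAND_XOROSHIRO_ONLY_HIGHER brings little benefit with it.
 */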
/*** Libc SVID generator ***/

#ifdef FASTRAND_USE_LIBC_SVID

#define FASTRAND_ONE_BITS 32

static void fastrand_init(struct fastrand *c)
{
  DBG("RANDOM: Initialized libc SVID random generator, seed=0x%jx", (uintmax_t)c->seed_value);
  if (srand48_r(c->seed_value, &c->svid_data) < 0)
    die("RANDOM: Failed to call srand48_r(): %m");
}

#ifdef LOCAL_DEBUG
static NONRET void fastrand_mrand48_r_failed(void)
{
  die("RANDOM: Failed to call mrand48_r(): %m");
}
#endif

static inline u32 fastrand_one(struct fastrand *c)
{
  long int r;
#ifdef LOCAL_DEBUG
  if (unlikely(mrand48_r(&c->svid_data, &r) < 0))
    fastrand_mrand48_r_failed();
#else
  (void) mrand48_r(&c->svid_data, &r);
#endif
  return r;
}

#endif

/*** Interface for creating/deleting/setting contexts ***/

struct fastrand *fastrand_new(void)
{
  struct fastrand *c = xmalloc_zero(sizeof(*c));
  return c;
}

void fastrand_delete(struct fastrand *c)
{
  ASSERT(c);
  xfree(c);
}

void fastrand_set_seed(struct fastrand *c, fastrand_seed_t seed)
{
  c->seed_value = seed;
  c->inited = true;
  fastrand_init(c);
}

fastrand_seed_t fastrand_gen_seed_value(void)
{
  // Prefer the kernel's urandom if the getrandom() syscall is available.
  u64 val;
  if (strongrand_getrandom_mem_try(&val, sizeof(val)))
    return val;

  // Otherwise generate a seed value as a mixture of:
  //  -- the current time
  //  -- the process id, to deal with frequent fork() or startups
  //  -- a counter, to deal with multiple contexts, for example in threaded applications
  static u64 static_counter;
  ucwlib_lock();
  val = (static_counter += 0x1927a7639274162dLLU);
  ucwlib_unlock();
  val += getpid();
  struct timeval tv;
  if (gettimeofday(&tv, NULL) < 0)
    die("RANDOM: Failed to call gettimeofday(): %m");
  val += (u64)(tv.tv_sec * 1000000LL + tv.tv_usec) * 0x5a03173d83f78347LLU;
  return val;
}

fastrand_seed_t fastrand_gen_seed(struct fastrand *c)
{
  fastrand_seed_t seed = fastrand_gen_seed_value();
  fastrand_set_seed(c, seed);
  return seed;
}

bool fastrand_has_seed(struct fastrand *c)
{
  return c->inited;
}

void fastrand_reset(struct fastrand *c)
{
  c->inited = false;
}

#if 1	// XXX: Adds some overhead, but it is probably worth it
#define FASTRAND_CHECK_SEED(c) do { if (unlikely(!(c)->inited)) fastrand_check_seed_failed(); } while (0)
static NONRET void fastrand_check_seed_failed(void)
{
  ASSERT(0);
}
#else
#define FASTRAND_CHECK_SEED(c) do { } while (0)
#endif

/*** Reading interface ***/

// We assume a 32 to 64 bit output of fastrand_one().
// For 32 bits fastrand_one() can return the u32 type, otherwise it must return u64.
COMPILE_ASSERT(fastrand_one_range, FASTRAND_ONE_BITS >= 32 && FASTRAND_ONE_BITS <= 64);

#if FASTRAND_ONE_BITS == 32
#define FASTRAND_ONE_MAX UINT32_MAX
#else
#define FASTRAND_ONE_MAX (UINT64_MAX >> (64 - FASTRAND_ONE_BITS))
#endif

static inline u32 fastrand_u32_helper(struct fastrand *c)
{
#if FASTRAND_ONE_BITS > 32 && defined(FASTRAND_PREFER_HIGHER)
  return fastrand_one(c) >> (FASTRAND_ONE_BITS - 32);
#else
  return fastrand_one(c);
#endif
}

u32 fastrand_u32(struct fastrand *c)
{
  FASTRAND_CHECK_SEED(c);
  return fastrand_u32_helper(c);
}

u64 fastrand_u64(struct fastrand *c)
{
  FASTRAND_CHECK_SEED(c);
#if FASTRAND_ONE_BITS < 64
  return ((u64)fastrand_one(c) << FASTRAND_ONE_BITS) | fastrand_one(c);
#else
  return fastrand_one(c);
#endif
}
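/*
 * A minimal usage sketch of the interface above (illustration only):
 *
 *	struct fastrand *r = fastrand_new();
 *	fastrand_set_seed(r, 42);		// fixed seed -> reproducible stream
 *	// ... or: fastrand_gen_seed(r);	// seed from urandom/time/pid
 *	u32 a = fastrand_u32(r);
 *	u64 b = fastrand_u64(r);
 *	fastrand_delete(r);
 *
 * Reading from a context that has not been seeded trips FASTRAND_CHECK_SEED()
 * and fails the assertion.
 */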
u64 fastrand_bits_u64(struct fastrand *c, uint bits)
{
#ifdef LOCAL_DEBUG
  ASSERT(bits <= 64);
#endif
  // Note that we never shift an integer by its full width (forbidden by the C standard).
  FASTRAND_CHECK_SEED(c);
#if FASTRAND_ONE_BITS == 64
  if (!bits)
    return 0;
  return fastrand_one(c) >> (FASTRAND_ONE_BITS - bits);
#else
  u64 r = fastrand_one(c);
  if (bits <= FASTRAND_ONE_BITS)
    return r >> (FASTRAND_ONE_BITS - bits);
  return (r << (bits - FASTRAND_ONE_BITS)) | (fastrand_one(c) >> (2 * FASTRAND_ONE_BITS - bits));
#endif
}

/*
 * fastrand_max() and variants
 *
 * Algorithm 1:
 *
 * We can return r % bound from a uniformly generated i-bit number r,
 * but only if:
 *
 *	r - r % bound + bound <= 2^i
 *	r - r % bound <= (2^i - 1) - bound + 1	-- no risk of integer overflow
 *	r - r % bound <= ((2^i - 1) + 1) - bound	-- can temporarily overflow
 *
 * Then r falls into a complete interval of length bound within the i-bit range
 * and the results are uniformly distributed.
 *
 * If r is too high, we simply generate a different r and repeat the whole process.
 * The probability of rejection is < 50% for any bound <= 2^i (the worst case being
 * bound == 2^i / 2 + 1), so the expected number of iterations is < 2.
 *
 *
 * Algorithm 2 (no division in many cases, but needs a wide multiplication):
 * ("Fast Random Integer Generation in an Interval", 2019, Daniel Lemire)
 *
 * Let bound < 2^i. Then for every j in [0, bound) the interval
 * [j * 2^i + 2^i % bound, (j + 1) * 2^i) contains the same number of the
 * multiples [0, 1 * bound, 2 * bound, ..., 2^i * bound), exactly 2^i / bound
 * (rounded down) of them.
 *
 * Therefore if we have a random i-bit number r and r * bound falls into one of
 * these intervals, we can immediately return (r * bound) / 2^i (the index of that
 * interval) and the results are correctly distributed. If we are lucky and
 * r * bound falls into the even smaller interval [j * 2^i + bound, (j + 1) * 2^i),
 * we can verify that very quickly without any division.
 *
 * The probability of needing multiple iterations is again < 50%.
 * The probability that the division-less check suffices is (2^i - bound) / 2^i.
 *
 *
 * Algorithm 3 (never divides or multiplies, but needs more iterations):
 *
 * Let 2^(i-1) <= bound < 2^i. Then an i-bit random number r has at least a 50%
 * probability of being lower than bound, in which case we can simply return it.
 * Can be useful with a very fast fastrand_one() like xoroshiro128+.
 */

#define FASTRAND_MAX_U32_ALGORITHM 2
#define FASTRAND_MAX_U64_ALGORITHM 3
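/*
 * A worked example of both rejection tests with tiny numbers (i = 4, so
 * 2^i = 16, and bound = 6); the values are only an illustration:
 *
 * Algorithm 1 accepts r iff r - r % 6 <= 16 - 6 = 10, i.e. r in [0, 12),
 * and returns r % 6: each result 0..5 is produced by exactly two values of r.
 *
 * Algorithm 2 computes m = r * 6 in [0, 96) and returns m >> 4, rejecting
 * when l = m & 15 falls below t = 16 % 6 = 4. E.g. r = 3 gives m = 18,
 * l = 2 < 4 -> reject; r = 4 gives m = 24, l = 8 -> accept, result
 * 24 >> 4 = 1. Out of the 16 possible r, exactly 4 are rejected and each
 * result 0..5 keeps exactly two preimages, so the accepted outputs are uniform.
 */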
static inline u32 fastrand_max_u32_helper(struct fastrand *c, u32 bound)
{
  // Variant for bound <= UINT32_MAX.
  // This is the most common case -> we optimize it and inline the code in both fastrand_max() and fastrand_max_u64().
#if FASTRAND_MAX_U32_ALGORITHM == 1
  while (1)
    {
      u32 r = fastrand_u32_helper(c);
      u32 x = r % bound;
      if (r - x <= (u32)-bound)
	return x;
#if 1
      // Possible optimization of the unlucky cases
      u32 y = r - x;
      do
	r = fastrand_u32_helper(c);
      while (r >= y);
      return r % bound;
#endif
    }
#elif FASTRAND_MAX_U32_ALGORITHM == 2
  u32 r = fastrand_u32_helper(c);
  u64 m = (u64)r * bound;
  u32 l = (u32)m;
  if (l < bound)
    {
      u32 t = (u32)-bound;
#if 1
      // This shortcut decreases the probability of a division to <= 50% even in
      // the worst case and also optimizes the bad cases: if bound > 2^31, then
      // t = 2^32 - bound is already reduced modulo bound, so the division below
      // can be skipped entirely. We still accept when l >= t as usual; only a
      // true reject (l < t) switches to a plain rejection loop, which needs no
      // division either and accepts each attempt with probability > 50%.
      if (t < bound)
	{
	  if (l >= t)
	    return m >> 32;
	  do
	    r = fastrand_u32_helper(c);
	  while (r >= bound);
	  return r;
	}
#endif
      t = t % bound;
      while (l < t)
	{
	  r = fastrand_u32_helper(c);
	  m = (u64)r * bound;
	  l = (u32)m;
	}
    }
  return m >> 32;
#elif FASTRAND_MAX_U32_ALGORITHM == 3
  // XXX: Could probably be optimized with __builtin_clz()
  u32 r, y = bound - 1;
  y |= y >> 1;
  y |= y >> 2;
  y |= y >> 4;
  y |= y >> 8;
  y |= y >> 16;
  do
    r = fastrand_u32_helper(c) & y;
  while (r >= bound);
  return r;
#else
# error Invalid FASTRAND_MAX_U32_ALGORITHM
#endif
}

static u64 fastrand_max_u64_helper(struct fastrand *c, u64 bound)
{
  // The less common case -> a compromise between speed and size
#if FASTRAND_MAX_U64_ALGORITHM == 1
  while (1)
    {
      u64 r = fastrand_one(c);
      u64 m = FASTRAND_ONE_MAX;
#if FASTRAND_ONE_BITS < 64
      if (bound > m + 1)
	{
	  r = (r << FASTRAND_ONE_BITS) | fastrand_one(c);
	  m = (m << FASTRAND_ONE_BITS) | FASTRAND_ONE_MAX;
	}
#endif
      u64 x = r % bound;
      if (r - x <= m + 1 - bound)
	return x;
    }
#elif FASTRAND_MAX_U64_ALGORITHM == 3
  u64 r, y = bound - 1;
  y |= y >> 1;
  y |= y >> 2;
  y |= y >> 4;
  y |= y >> 8;
  y |= y >> 16;
  y |= y >> 32;
  do
    {
      r = fastrand_one(c);
#if FASTRAND_ONE_BITS < 64
      if (y > FASTRAND_ONE_MAX)
	r = (r << FASTRAND_ONE_BITS) | fastrand_one(c);
#endif
      r &= y;
    }
  while (r >= bound);
  return r;
#else
# error Invalid FASTRAND_MAX_U64_ALGORITHM
#endif
}

uint fastrand_max(struct fastrand *c, uint bound)
{
  FASTRAND_CHECK_SEED(c);
#if UINT32_MAX < UINT_MAX
  if (bound <= UINT32_MAX)
    return fastrand_max_u32_helper(c, bound);
  else
    return fastrand_max_u64_helper(c, bound);
#else
  return fastrand_max_u32_helper(c, bound);
#endif
}

u64 fastrand_max_u64(struct fastrand *c, u64 bound)
{
  FASTRAND_CHECK_SEED(c);
  if (bound <= UINT32_MAX)
    return fastrand_max_u32_helper(c, bound);
  else
    return fastrand_max_u64_helper(c, bound);
}

void fastrand_mem(struct fastrand *c, void *ptr, size_t size)
{
  // XXX: Could be optimized to use aligned assignments instead of put_u32/64,
  // but it would be quite complex to make that deterministic (for the same seed
  // and any @ptr) and large sizes are not used often anyway.
  FASTRAND_CHECK_SEED(c);
  byte *p = ptr;
#if FASTRAND_ONE_BITS < 64
  for (size_t n = size >> 2; n--; p += 4)
    put_u32(p, fastrand_u32_helper(c));
  if (size &= 3)
    {
      u32 x = fastrand_u32_helper(c);
#else
  for (size_t n = size >> 3; n--; p += 8)
    put_u64(p, fastrand_one(c));
  if (size &= 7)
    {
      u64 x = fastrand_one(c);
#endif
      while (size--)
	{
	  *p++ = x;
	  x >>= 8;
	}
    }
}

double fastrand_double(struct fastrand *c)
{
  // Generate 52 random bits
  FASTRAND_CHECK_SEED(c);
  u64 r = fastrand_one(c);
#if FASTRAND_ONE_BITS >= 52
  r >>= FASTRAND_ONE_BITS - 52;
#else
  // Do we want to generate doubles with the full 52-bit precision?
#if 0
  r = (r << (52 - FASTRAND_ONE_BITS)) | (fastrand_one(c) >> (2 * FASTRAND_ONE_BITS - 52));
#else
  r <<= 52 - FASTRAND_ONE_BITS;
#endif
#endif

  // With the standard IEC 60559 / IEEE 754 format, we can directly store @r in
  // a double as a value from the range [1.0, 2.0) and subtract 1.0.
  // But the alternative with a multiplication is often faster anyway.
#if 0 && defined(__STDC_IEC_559__)
  union { u64 u; double d; } tmp;
  ASSERT(sizeof(double) == 8 && sizeof(tmp) == 8);
  tmp.u = (0x3ffLLU << 52) | r;
  return tmp.d - 1.0;
#elif 1
  static const double mul = 1.0 / (1LLU << 52);
  return r * mul;
#else
  return ldexp(r, -52);
#endif
}
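/*
 * A concrete trace of the two mappings above (illustration only):
 *
 * The union trick: with the exponent bits 0x3ff and mantissa r, the stored
 * double is 1.r * 2^0, i.e. a value in [1.0, 2.0); e.g. r = 2^51 (the top
 * mantissa bit) yields 1.5, and subtracting 1.0 gives 0.5.
 *
 * The multiplication: r * 2^-52 maps r = 0 to 0.0 and r = 2^52 - 1 to
 * 1 - 2^-52, so the result always stays in [0, 1). Both variants therefore
 * produce the same set of 2^52 equidistant values.
 */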