sksp2024-mcu/libucw/ucw/random-fast.c

651 lines
16 KiB
C
Raw Normal View History

2024-09-14 21:50:33 +02:00
/*
* UCW Library -- Fast pseudo-random generator
*
* (c) 2020--2022 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
*/
#undef LOCAL_DEBUG
#include <ucw/lib.h>
#include <ucw/random.h>
#include <ucw/threads.h>
#include <ucw/unaligned.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
// Select exactly one of these pseudo-random generators:
//#define FASTRAND_USE_PCG
//#define FASTRAND_USE_SPLITMIX
#define FASTRAND_USE_XOROSHIRO
//#define FASTRAND_USE_LIBC_SVID
/**
* FASTRAND_USE_PCG
*
* References:
* -- https://en.wikipedia.org/wiki/Permuted_congruential_generator
* -- https://www.pcg-random.org/
* -- "PCG: A Family of Simple Fast Space-Efficient Statistically Good Algorithms for Random Number Generation",
* 2014, Melissa E. ONeill
*
* Notes:
* -- Simple and fast generator with good statistical properties.
* -- We use a constant increment and 64->32bit XSH RR output function.
**/
#define FASTRAND_PCG_MUL 6364136223846793005LLU
#define FASTRAND_PCG_INC 1442695040888963407LLU
/**
* FASTRAND_USE_SPLITMIX
*
* References:
* -- http://prng.di.unimi.it/splitmix64.c (public, no copyright)
* -- http://docs.oracle.com/javase/8/docs/api/java/util/SplittableRandom.html
* -- "Fast Splittable Pseudorandom Number Generators",
* 2014, Guy L. Steele Jr., Doug Lea, Christine H. Flood
*
* Notes:
* -- Each 64 bit number is generated exactly once in a cycle,
* so we only use half of it -> 32 bit output.
**/
/**
* FASTRAND_USE_XOROSHIRO
*
* References:
* -- http://prng.di.unimi.it/ (examples are public, no copyright)
* -- "Scrambled Linear Pseudorandom Number Generators",
* 2019, David Blackman, Sebastiano Vigna
*
* Notes:
* -- A family of very fast generators. Good statistical properties,
* some variants just have problems with lowest bits.
* -- We support more xo(ro)shiro variants, see the options below.
**/
// Engine: xoroshiro 128, xoshiro 256
#define FASTRAND_XOROSHIRO_SIZE 128
// Scrambler: 1 (+), 2 (++), 3 (**)
#define FASTRAND_XOROSHIRO_SCRAMBLER 1
// If defined, we only use the highest 60 bits from each 64 bit number;
// useful especially for the weakest '+' scrambler. Even if not
// we prefer higher bits where trivial.
#undef FASTRAND_XOROSHIRO_ONLY_HIGHER
/**
* FASTRAND_USE_LIBC_SVID
* -- mrand48_r()
* -- libc SVID generator
**/
struct fastrand {
#ifdef FASTRAND_USE_PCG
u64 pcg_state; // PCG generator's current state
#endif
#ifdef FASTRAND_USE_SPLITMIX
u64 splitmix_state;
#endif
#ifdef FASTRAND_USE_XOROSHIRO
u64 xoroshiro_state[FASTRAND_XOROSHIRO_SIZE / 64];
#endif
#ifdef FASTRAND_USE_LIBC_SVID
struct drand48_data svid_data; // SVID generator's state structure
#endif
bool inited; // Is seeded?
fastrand_seed_t seed_value; // Initial seed value; useful for reproducible debugging
};
/*** Common helpers ***/
// GCC should optimize these to a single instruction
static inline u32 fastrand_ror32(u32 x, uint r)
{
return (x >> r) | (x << (-r & 31));
}
static inline u64 fastrand_rol64(u64 x, uint r)
{
return (x << r) | (x >> (-r & 63));
}
/*** PCG random generator ***/
#ifdef FASTRAND_USE_PCG
#define FASTRAND_ONE_BITS 32
static inline u32 fastrand_one(struct fastrand *c)
{
u64 s = c->pcg_state;
uint r = s >> 59;
c->pcg_state = s * FASTRAND_PCG_MUL + FASTRAND_PCG_INC;
u32 x = ((s >> 18) ^ s) >> 27;
return fastrand_ror32(x, r);
}
static void fastrand_init(struct fastrand *c)
{
c->pcg_state = c->seed_value + FASTRAND_PCG_INC;
(void) fastrand_one(c);
DBG("RANDOM: Initialized PCG random generator, seed=0x%jx", (uintmax_t)c->seed_value);
}
#endif
/*** splitmix64 generator ***/
// A helper; used also for seeding xoroshiro
static inline u64 fastrand_splitmix64_next(u64 *state)
{
u64 z = (*state += 0x9e3779b97f4a7c15LLU);
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9LLU;
z = (z ^ (z >> 27)) * 0x94d049bb133111ebLLU;
return z ^ (z >> 31);
}
#ifdef FASTRAND_USE_SPLITMIX
#define FASTRAND_ONE_BITS 32
static void fastrand_init(struct fastrand *c)
{
c->splitmix_state = c->seed_value;
DBG("RANDOM: Initialized splitmix64 random generator, seed=0x%jx", (uintmax_t)c->seed_value);
}
static inline u32 fastrand_one(struct fastrand *c)
{
return fastrand_splitmix64_next(&c->splitmix_state);
}
#endif
/*** xoroshiro variants ***/
#ifdef FASTRAND_USE_XOROSHIRO
#ifdef FASTRAND_XOROSHIRO_ONLY_HIGHER
# define FASTRAND_ONE_BITS 60
#else
# define FASTRAND_ONE_BITS 64
# define FASTRAND_PREFER_HIGHER
#endif
static void fastrand_init(struct fastrand *c)
{
u64 x = c->seed_value;
for (uint i = 0; i < ARRAY_SIZE(c->xoroshiro_state); i++)
c->xoroshiro_state[i] = fastrand_splitmix64_next(&x);
if (unlikely(!c->xoroshiro_state[0]))
c->xoroshiro_state[0] = 1;
DBG("RANDOM: Initialized xoroshiro random generator, seed=0x%jx, size=%u, scrambler=%u",
(uintmax_t)c->seed_value, FASTRAND_XOROSHIRO_SIZE, FASTRAND_XOROSHIRO_SCRAMBLER);
}
static inline u64 fastrand_one(struct fastrand *c)
{
u64 result;
u64 *s = c->xoroshiro_state;
#if FASTRAND_XOROSHIRO_SIZE == 128
u64 s0 = s[0];
u64 s1 = s[1];
#if FASTRAND_XOROSHIRO_SCRAMBLER == 1
result = s0 + s1;
uint ra = 24, rb = 16, rc = 37;
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 2
result = fastrand_rol64(s0 + s1, 17) + s0;
uint ra = 49, rb = 21, rc = 28;
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 3
result = fastrand_rol64(s0 * 5, 7) * 9;
uint ra = 24, rb = 16, rc = 37;
#else
# error "Unsupported xoroshiro parameters"
#endif
s[0] = fastrand_rol64(s0, ra) ^ s1 ^ (s1 << rb);
s[1] = fastrand_rol64(s1, rc);
#elif FASTRAND_XOROSHIRO_SIZE == 256
#if FASTRAND_XOROSHIRO_SCRAMBLER == 1
result = s[0] + s[3];
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 2
result = fastrand_rol64(s[0] + s[3], 23) + s[0];
#elif FASTRAND_XOROSHIRO_SCRAMBLER == 3
result = fastrand_rol64(s[1] * 5, 7) * 9;
#else
# error "Unsupported xoroshiro parameters"
#endif
u64 t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = fastrand_rol64(s[3], 45);
#else
# error "Unsupported xoroshiro parameters"
#endif
#ifdef FASTRAND_XOROSHIRO_ONLY_HIGHER
result >>= 4;
#endif
return result;
}
#endif
/*** Libc SVID generator ***/
#ifdef FASTRAND_USE_LIBC_SVID
#define FASTRAND_ONE_BITS 32
static void fastrand_init(struct fastrand *c)
{
DBG("RANDOM: Initialized libc SVID random generator, seed=0x%jx", (uintmax_t)c->seed_value);
if (srand48_r(c->seed_value, &c->svid_data) < 0)
die("RANDOM: Failed to call srand48_r(): %m");
}
#ifdef LOCAL_DEBUG
static NONRET void fastrand_mrand48_r_failed(void)
{
die("RANDOM: Failed to call mrand48_r(): %m");
}
#endif
static inline u32 fastrand_one(struct fastrand *c)
{
long int r;
#ifdef LOCAL_DEBUG
if (unlikely(mrand48_r(&c->svid_data, &r) < 0))
fastrand_mrand48_r_failed();
#else
(void) mrand48_r(&c->svid_data, &r);
#endif
return r;
}
#endif
/*** Interface for creating/deleting/setting contexts ***/
struct fastrand *fastrand_new(void)
{
struct fastrand *c = xmalloc_zero(sizeof(*c));
return c;
}
void fastrand_delete(struct fastrand *c)
{
ASSERT(c);
xfree(c);
}
void fastrand_set_seed(struct fastrand *c, fastrand_seed_t seed)
{
c->seed_value = seed;
c->inited = true;
fastrand_init(c);
}
fastrand_seed_t fastrand_gen_seed_value(void)
{
// Prefer kernel's urandom if getrandom() syscall is available.
u64 val;
if (strongrand_getrandom_mem_try(&val, sizeof(val)))
return val;
// Generate a seed value as a mixture of:
// -- current time
// -- process id to deal with frequent fork() or startups
// -- counter to deal with multiple contexts, for example in threaded application
static u64 static_counter;
ucwlib_lock();
val = (static_counter += 0x1927a7639274162dLLU);
ucwlib_unlock();
val += getpid();
struct timeval tv;
if (gettimeofday(&tv, NULL) < 0)
die("RANDOM: Failed to call gettimeofday(): %m");
val += (u64)(tv.tv_sec * 1000000LL + tv.tv_usec) * 0x5a03173d83f78347LLU;
return val;
}
fastrand_seed_t fastrand_gen_seed(struct fastrand *c)
{
fastrand_seed_t seed = fastrand_gen_seed_value();
fastrand_set_seed(c, seed);
return seed;
}
bool fastrand_has_seed(struct fastrand *c)
{
return c->inited;
}
void fastrand_reset(struct fastrand *c)
{
c->inited = false;
}
#if 1
// XXX: Adds some overhead, but probably worth it
#define FASTRAND_CHECK_SEED(c) do { if (unlikely(!(c)->inited)) fastrand_check_seed_failed(); } while (0)
static NONRET void fastrand_check_seed_failed(void)
{
ASSERT(0);
}
#else
#define FASTRAND_CHECK_SEED(c) do { } while (0)
#endif
/*** Reading interface ***/
// We assume 32-64 bit output of fastrand_one().
// For 32 bits fastrand_one() can return u32 type, otherwise it must be u64.
COMPILE_ASSERT(fastrand_one_range, FASTRAND_ONE_BITS >= 32 && FASTRAND_ONE_BITS <= 64);
#if FASTRAND_ONE_BITS == 32
#define FASTRAND_ONE_MAX UINT32_MAX
#else
#define FASTRAND_ONE_MAX (UINT64_MAX >> (64 - FASTRAND_ONE_BITS))
#endif
static inline u32 fastrand_u32_helper(struct fastrand *c)
{
#if FASTRAND_ONE_BITS > 32 && defined(FASTRAND_PREFER_HIGHER)
return fastrand_one(c) >> (FASTRAND_ONE_BITS - 32);
#else
return fastrand_one(c);
#endif
}
u32 fastrand_u32(struct fastrand *c)
{
FASTRAND_CHECK_SEED(c);
return fastrand_u32_helper(c);
}
u64 fastrand_u64(struct fastrand *c)
{
FASTRAND_CHECK_SEED(c);
#if FASTRAND_ONE_BITS < 64
return ((u64)fastrand_one(c) << FASTRAND_ONE_BITS) | fastrand_one(c);
#else
return fastrand_one(c);
#endif
}
u64 fastrand_bits_u64(struct fastrand *c, uint bits)
{
#ifdef LOCAL_DEBUG
ASSERT(bits <= 64);
#endif
// Notice that we never rotate an integer by all its bits (forbidden by C standard).
FASTRAND_CHECK_SEED(c);
#if FASTRAND_ONE_BITS == 64
if (!bits)
return 0;
return fastrand_one(c) >> (FASTRAND_ONE_BITS - bits);
#else
u64 r = fastrand_one(c);
if (bits <= FASTRAND_ONE_BITS)
return r >> (FASTRAND_ONE_BITS - bits);
return (r << (bits - FASTRAND_ONE_BITS)) | (fastrand_one(c) >> (2 * FASTRAND_ONE_BITS - bits));
#endif
}
/*
* fastrand_max() and variants
*
* Algorithm 1:
*
* We can return r % bound from uniformly generated i-bit number r,
* but only if:
* r - r % bound + bound <= 2^i
* r - r % bound <= 2^i-1 - bound + 1 -- no risk of integer overflow
* r - r % bound <= (2^i-1 + 1) - bound -- can temporarily overflow
*
* Then r fits to a complete interval in the i-bit range
* and results should be uniformly distributed.
*
* If r is too high, we simply generate a different r and repeat
* the whole process. The probability is < 50% for any bound <= 2^i
* (with worst case bound == 2^i / 2 + 1), so the average number
* of iterations is < 2.
*
*
* Algorithm 2 (without any division in many cases, but needs big multiplication):
* ("Fast Random Integer Generation in an Interval", 2019, Daniel Lemier)
*
* Let bound < 2^i. Then for j in [0, bound) each interval [j * 2^i + 2^i % bound, (j + 1) * 2^i)
* contains the same number of multiples [0, 1 * bound, 2 * bound, ..., 2^i * bound),
* exactly 2^i / bound (rounded down) of them.
*
* Therefore if we have a random i-bit number r and r * bound fits to one of these intervals,
* we can immediately return (r * bound) / 2^i (index of that interval) and results should be
* correctly distributed. If we are lucky and r fits to even smaller interval
* [j * 2^i + bound, (j + 1) * 2^i], we can check that very quickly without any division.
*
* Probability of multiple iterations is again < 50%.
* Probability of division-less check is (2^i - bound) / 2^i.
*
*
* Algorithm 3 (always without divisions or multiplications, but needs more iterations):
*
* Let 2^(i-1) <= bound < 2^i. Then i-bit random number r has at least 50% probability
* to be lower than bound, when we can simply return it.
* Can be useful for very fast fastrand_one() like xoroshiro128+.
*/
#define FASTRAND_MAX_U32_ALGORITHM 2
#define FASTRAND_MAX_U64_ALGORITHM 3
static inline u32 fastrand_max_u32_helper(struct fastrand *c, u32 bound)
{
// Variant for bound <= UINT32_MAX.
// This is the most common case -> we optimize it and inline the code to both fastrand_max() and fastrand_max_u64().
#if FASTRAND_MAX_U32_ALGORITHM == 1
while (1)
{
u32 r = fastrand_u32_helper(c);
u32 x = r % bound;
if (r - x <= (u32)-bound)
return x;
#if 1
// Possible optimization for unlucky cases
u32 y = r - x;
do
r = fastrand_u32_helper(c);
while (r >= y);
return r % bound;
#endif
}
#elif FASTRAND_MAX_U32_ALGORITHM == 2
u32 r = fastrand_u32_helper(c);
u64 m = (u64)r * bound;
u32 l = (u32)m;
if (l < bound)
{
u32 t = (u32)-bound;
#if 1
// This condition can decrease the probability of division to <= 50% even in worst case and also optimize bad cases.
if (t < bound)
{
// Implies l < t % bound, so we don't need to check it before switching to completely different loop.
do
r = fastrand_u32_helper(c);
while (r >= bound);
return r;
}
#endif
t = t % bound;
while (l < t)
{
r = fastrand_u32_helper(c);
m = (u64)r * bound;
l = (u32)m;
}
}
return m >> 32;
#elif FASTRAND_MAX_U32_ALGORITHM == 3
// XXX: Probably could be optimized with __builtin_clz()
u32 r, y = bound - 1;
y |= y >> 1;
y |= y >> 2;
y |= y >> 4;
y |= y >> 8;
y |= y >> 16;
do
r = fastrand_u32_helper(c) & y;
while (r >= bound);
return r;
#else
# error Invalid FASTRAND_MAX_U32_ALGORITHM
#endif
}
static u64 fastrand_max_u64_helper(struct fastrand *c, u64 bound)
{
// Less common case -> compromise between speed and size
#if FASTRAND_MAX_U64_ALGORITHM == 1
while (1)
{
u64 r = fastrand_one(c);
u64 m = FASTRAND_ONE_MAX;
#if FASTRAND_ONE_BITS < 64
if (bound > m + 1)
{
r = (r << FASTRAND_ONE_BITS) | fastrand_one(c);
m = (m << FASTRAND_ONE_BITS) | FASTRAND_ONE_MAX;
}
#endif
u64 x = r % bound;
if (r - x <= m + 1 - bound)
return x;
}
#elif FASTRAND_MAX_U64_ALGORITHM == 3
u64 r, y = bound - 1;
y |= y >> 1;
y |= y >> 2;
y |= y >> 4;
y |= y >> 8;
y |= y >> 16;
y |= y >> 32;
do
{
r = fastrand_one(c);
#if FASTRAND_ONE_BITS < 64
if (y > FASTRAND_ONE_MAX)
r = (r << FASTRAND_ONE_BITS) | fastrand_one(c);
#endif
r &= y;
}
while (r >= bound);
return r;
#else
# error Invalid FASTRAND_MAX_U64_ALGORITHM
#endif
}
uint fastrand_max(struct fastrand *c, uint bound)
{
FASTRAND_CHECK_SEED(c);
#if UINT32_MAX < UINT_MAX
if (bound <= UINT32_MAX)
return fastrand_max_u32_helper(c, bound);
else
return fastrand_max_u64_helper(c, bound);
#else
return fastrand_max_u32_helper(c, bound);
#endif
}
u64 fastrand_max_u64(struct fastrand *c, u64 bound)
{
FASTRAND_CHECK_SEED(c);
if (bound <= UINT32_MAX)
return fastrand_max_u32_helper(c, bound);
else
return fastrand_max_u64_helper(c, bound);
}
void fastrand_mem(struct fastrand *c, void *ptr, size_t size)
{
// XXX: Could be optimized to aligned assignments instead of put_u32/64, but it
// would be quite complex to make it deterministic (for same seed and any @ptr)
// and large sizes are not used often anyway.
FASTRAND_CHECK_SEED(c);
byte *p = ptr;
#if FASTRAND_ONE_BITS < 64
for (size_t n = size >> 2; n--; p += 4)
put_u32(p, fastrand_u32_helper(c));
if (size &= 3)
{
u32 x = fastrand_u32_helper(c);
#else
for (size_t n = size >> 3; n--; p += 8)
put_u64(p, fastrand_one(c));
if (size &= 7)
{
u64 x = fastrand_one(c);
#endif
while (size--)
{
*p++ = x;
x >>= 8;
}
}
}
double fastrand_double(struct fastrand *c)
{
// Generate 52 random bits
FASTRAND_CHECK_SEED(c);
u64 r = fastrand_one(c);
#if FASTRAND_ONE_BITS >= 52
r >>= FASTRAND_ONE_BITS - 52;
#else
// Do we want to generate doubles with full 52 precision?
#if 0
r = (r << (52 - FASTRAND_ONE_BITS)) | (fastrand_one(c) >> (2 * FASTRAND_ONE_BITS - 52));
#else
r <<= 52 - FASTRAND_ONE_BITS;
#endif
#endif
// With standard IEC 60559 / IEEE 754 format, we can directly store @r as a double
// from range [1.0, 2.0) and subtract 1.0.
// But the alternative with multiplication is often faster anyway.
#if 0 && defined(__STDC_IEC_559__)
union {
u64 u;
double d;
} tmp;
ASSERT(sizeof(double) == 8 && sizeof(tmp) == 8);
tmp.u = (0x3ffLLU << 52) | r;
return tmp.d - 1.0;
#elif 1
static const double mul = 1.0 / (1LLU << 52);
return r * mul;
#else
return ldexp(r, -52);
#endif
}