/*
    is parametrized: yes
*/

#if defined(GUF_RAND_IMPL_STATIC)
    #define GUF_RAND_KWRDS static
#else
    #define GUF_RAND_KWRDS
#endif

#ifndef GUF_RAND_H
#define GUF_RAND_H
#include "guf_common.h"

/*
    - guf_rand32 functions use the xoshiro128** 1.1 generator [1] (rng-state of 4 32-bit integers, i.e. 128 bits)
    - guf_rand64 functions use the xoshiro256** 1.0 generator [2] (rng-state of 4 64-bit integers, i.e. 256 bits)
    - guf_rand functions use either guf_rand32 or guf_rand64 depending on GUF_RAND_32_BIT (which is set globally in guf_common.h depending on the platform's word-size)

     [1] xoshiro128** 1.1 (public domain) written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
         cf. https://prng.di.unimi.it/xoshiro128starstar.c (last-retrieved 2025-02-11)

     [2] xoshiro256** 1.0 (public domain) written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
         cf. https://prng.di.unimi.it/xoshiro256starstar.c (last-retrieved 2025-02-11)
*/

// State for xoshiro128** 1.1 
typedef struct guf_rand32_state {
    uint32_t s[4]; // Must not be all zero.
} guf_rand32_state;

// State for xoshiro256** 1.0
#ifdef UINT64_MAX
    typedef struct guf_rand64_state {
        uint64_t s[4]; // Must not be all zero. 
    } guf_rand64_state;
#endif

#ifdef GUF_RAND_32_BIT
    // Use guf_rand32_state (i.e. xoshiro128** 1.1) as default.
    #define GUF_RAND_MAX UINT32_MAX 
    typedef guf_rand32_state guf_randstate;
    typedef uint32_t guf_rand_seed_t;
#else
    // Use guf_rand64_state (i.e. xoshiro256** 1.0) as default.
    #ifndef UINT64_MAX
        #error "guf_rand.h: Platform does not support uint64_t (define GUF_RAND_32_BIT to fix)"
    #endif
    #define GUF_RAND_MAX UINT64_MAX
    typedef guf_rand64_state guf_randstate; 
    typedef uint64_t guf_rand_seed_t;
#endif

/*
    - guf_randstate_init(state, seed) -> void
         Initialise the rng-state from a single 64-bit (or 32-bit) seed. 
         The seed is scrambled by guf_rand_splitmix64/32 internally. 
         Non-permissible states, i.e. states where all four state-integers turn out to be zero,
         will be automatically corrected, which means all seeds passed to guf_randstate_init are permissible. 
         (If you want to initialise the guf_randstate struct manually, you have to ensure yourself the four state-integers aren't all zero.)
*/
GUF_RAND_KWRDS void guf_randstate_init(guf_randstate *state, guf_rand_seed_t seed);
#ifdef UINT64_MAX
    GUF_RAND_KWRDS void guf_rand64_state_init(guf_rand64_state *state, uint64_t seed);
#endif
GUF_RAND_KWRDS void guf_rand32_state_init(guf_rand32_state *state, uint32_t seed);

/*
    - guf_randstate_jump(state) 
      -> void; advance the rng-state as if 2^128 (or 2^64 for rand32_state) calls to guf_rand_u32 had occured. 
         Can be used to generate 2^128 (or 2^64 for rand32_state) non-overlapping subsequences for parallel computations.
*/
GUF_RAND_KWRDS void guf_randstate_jump(guf_randstate *state); // Equivalent to 2^128 (or 2^64) calls to guf_rand_u32
#ifdef UINT64_MAX
    GUF_RAND_KWRDS void guf_rand64_state_jump(guf_rand64_state *state); // Equivalent to 2^128 calls to guf_rand64_u64
#endif
GUF_RAND_KWRDS void guf_rand32_state_jump(guf_rand32_state *state); // Equivalent to 2^64 calls to guf_rand32_u32


// Uniform distributions in default ranges:

/*
    - guf_rand_splitmix64(state) -> uint64_t in range [0, UINT64_MAX]
      (Very simple rng with only 64-bits of state; used for "scrambling" 64-bit seeds in guf_randstate_init.)
    - guf_rand_splitmix32(state) -> uint32_t in range [0, UINT32_MAX]
      (Very simple rng with only 32-bits of state; used for "scrambling" 32-bit seeds in guf_randstate_init.)
*/
#ifdef UINT64_MAX
    GUF_RAND_KWRDS uint64_t guf_rand_splitmix64(uint64_t *state);
#endif
GUF_RAND_KWRDS uint32_t guf_rand_splitmix32(uint32_t *state);

/*
    - guf_rand_u32(state) -> uint32_t in range [0, UINT32_MAX]
*/
GUF_RAND_KWRDS uint32_t guf_rand_u32(guf_randstate *state); 
#ifdef UINT64_MAX
    GUF_RAND_KWRDS uint32_t guf_rand64_u32(guf_rand64_state *state);
#endif
GUF_RAND_KWRDS uint32_t guf_rand32_u32(guf_rand32_state *state);

/*
    - guf_rand_u64(state) -> uint64_t (or uint_least64_t) in range [0, UINT64_MAX]
      NOTE: May be slow on 32-bit platforms. 
      NOTE: If uint64_t is not available (optional according to the standards), use uint_least64_t (always available in C99 and above).
*/
#ifdef UINT64_MAX
    GUF_RAND_KWRDS uint64_t guf_rand_u64(guf_randstate *state);
    GUF_RAND_KWRDS uint64_t guf_rand32_u64(guf_rand32_state *state); 
    GUF_RAND_KWRDS uint64_t guf_rand64_u64(guf_rand64_state *state);
#else
    GUF_RAND_KWRDS uint_least64_t guf_rand_u64(guf_randstate *state)      
    GUF_RAND_KWRDS uint_least64_t guf_rand32_u64(guf_rand32_state *state);
#endif

/*
    - guf_rand_f64(state) -> double in range [0.0, 1.0)
      NOTE: May be slow on 32-bit platforms (as it calls guf_rand_u64)
*/
GUF_RAND_KWRDS double guf_rand_f64(guf_randstate *state); 
#ifdef UINT64_MAX
    GUF_RAND_KWRDS double guf_rand64_f64(guf_rand64_state *state); 
#endif
GUF_RAND_KWRDS double guf_rand32_f64(guf_rand32_state *state);

/*
    - guf_rand_f32(state) -> float in range [0.f, 1.f)
*/
GUF_RAND_KWRDS float guf_rand_f32(guf_randstate *state);
#ifdef UINT64_MAX
    GUF_RAND_KWRDS float guf_rand64_f32(guf_rand64_state *state); 
#endif
GUF_RAND_KWRDS float guf_rand32_f32(guf_rand32_state *state);     


// Uniform distributions in custom ranges:

/*
    - guf_randrange_f32(state, min, end) -> float in range  [min, end) (contrary to the integer equivalents, end is *not* inclusive)
    - guf_randrange_f64(state, min, end) -> double in range [min, end) (contrary to the integer equivalents, end is *not* inclusive)
      NOTE: f64 versions may be slow on 32-bit platforms.
*/
GUF_RAND_KWRDS float guf_randrange_f32(guf_randstate *state, float min, float end);
GUF_RAND_KWRDS double guf_randrange_f64(guf_randstate *state, double min, double end);
#ifdef UINT64_MAX
    GUF_RAND_KWRDS float guf_rand64_range_f32(guf_rand64_state *state, float min, float end); 
    GUF_RAND_KWRDS double guf_rand64_range_f64(guf_rand64_state *state, double min, double end);
#endif
GUF_RAND_KWRDS float guf_rand32_range_f32(guf_rand32_state *state, float min, float end);
GUF_RAND_KWRDS double guf_rand32_range_f64(guf_rand32_state *state, double min, double end);

/*
    - guf_randrange_i32(state, min, max) -> int32_t in range  [min, max] (contrary to the float equivalents, max *is* inclusive)
    - guf_randrange_u32(state, min, max) -> uint32_t in range [min, max] (contrary to the float equivalents, max *is* inclusive)
      NOTE: guf_randrange_u32 may be slow on 32-bit platforms (as it calls guf_rand_f64). 
            This does not apply to guf_randrange_i32 (as it doesn't call guf_rand_f64).
*/
GUF_RAND_KWRDS int32_t guf_randrange_i32(guf_randstate *state, int32_t min, int32_t max); 
GUF_RAND_KWRDS uint32_t guf_randrange_u32(guf_randstate *state, uint32_t min, uint32_t max); // NOTE: may be slow on 32-bit platforms.
#ifdef UINT64_MAX
    GUF_RAND_KWRDS int32_t guf_rand64_range_i32(guf_rand64_state *state, int32_t min, int32_t max); 
    GUF_RAND_KWRDS uint32_t guf_rand64_range_u32(guf_rand64_state *state, uint32_t min, uint32_t max);
#endif
GUF_RAND_KWRDS int32_t guf_rand32_range_i32(guf_rand32_state *state, int32_t min, int32_t max);   
GUF_RAND_KWRDS uint32_t guf_rand32_range_u32(guf_rand32_state *state, uint32_t min, uint32_t max); // NOTE: may be slow on 32-bit platforms.

/*
    - guf_randrange_i64(state, min, max) -> int64_t in range [min, max] (contrary to the float equivalents, max *is* inclusive)
      NOTE: The Generic version is only available if GUF_RAND_32_BIT is undefined and the platform supports uint64_t. 
            (The specific guf_rand64_range_i64 version is available as long as the platform supports uint64_t.)
*/
#if defined(UINT64_MAX) && !defined(GUF_RAND_32_BIT)
    GUF_RAND_KWRDS int64_t guf_randrange_i64(guf_randstate *state, int64_t min, int64_t max);
#endif
#ifdef UINT64_MAX
    GUF_RAND_KWRDS int64_t guf_rand64_range_i64(guf_rand64_state *state, int64_t min, int64_t max);
#endif

// Bernoulli-trials: 

/*
    - guf_rand_bernoulli_trial(state, p) -> return true with a probability of p, false with a probability of (1 - p)
      NOTE: p will be clamped to be in range [0.0, 1.0]
      NOTE: The f64 versions may be slow on 32-bit platforms. 
    - guf_rand_flip(state) -> return true with a probability of 50 %, i.e. fair coin flip (bernoulli trial with p == 0.5)
*/
GUF_RAND_KWRDS bool guf_rand_bernoulli_trial_f32(guf_randstate *state, float p);
GUF_RAND_KWRDS bool guf_rand_bernoulli_trial_f64(guf_randstate *state, double p); 
GUF_RAND_KWRDS bool guf_rand_flip(guf_randstate *state);
#ifdef UINT64_MAX
    GUF_RAND_KWRDS bool guf_rand64_bernoulli_trial_f32(guf_rand64_state *state, float p);
    GUF_RAND_KWRDS bool guf_rand64_bernoulli_trial_f64(guf_rand64_state *state, double p);
    GUF_RAND_KWRDS bool guf_rand64_flip(guf_rand64_state *state);
#endif
GUF_RAND_KWRDS bool guf_rand32_bernoulli_trial_f32(guf_rand32_state *state, float p);
GUF_RAND_KWRDS bool guf_rand32_bernoulli_trial_f64(guf_rand32_state *state, double p);
GUF_RAND_KWRDS bool guf_rand32_flip(guf_rand32_state *state); 

// Normal distributions: 

/*
    - guf_rand_normal_sample_f32/f64(state, mean, std_dev, result, n) 
      -> void; put n float/double samples following the given normal-distribution into result (result is allocated by the caller and must have enough space to hold at least n samples)
    - guf_rand_normal_sample_one_f32/f64(state, mean, std_dev) 
      -> return one float/double sample following the given normal-distribution
    - NOTE: the f64 versions may be slow on 32-bit platforms. 
*/
GUF_RAND_KWRDS void guf_rand_normal_sample_f64(guf_randstate *state, double mean, double std_dev, double *result, ptrdiff_t n);
GUF_RAND_KWRDS void guf_rand_normal_sample_f32(guf_randstate *state, float mean, float std_dev, float *result, ptrdiff_t n);
GUF_RAND_KWRDS double guf_rand_normal_sample_one_f64(guf_randstate *state, double mean, double std_dev);
GUF_RAND_KWRDS float guf_rand_normal_sample_one_f32(guf_randstate *state, float mean, float std_dev);
#ifdef UINT64_MAX
    GUF_RAND_KWRDS void guf_rand64_normal_sample_f32(guf_rand64_state *state, float mean, float std_dev, float *result, ptrdiff_t n);
    GUF_RAND_KWRDS void guf_rand64_normal_sample_f64(guf_rand64_state *state, double mean, double std_dev, double *result, ptrdiff_t n);
    GUF_RAND_KWRDS float guf_rand64_normal_sample_one_f32(guf_rand64_state *state, float mean, float std_dev);
    GUF_RAND_KWRDS double guf_rand64_normal_sample_one_f64(guf_rand64_state *state, double mean, double std_dev);
#endif
GUF_RAND_KWRDS void guf_rand32_normal_sample_f32(guf_rand32_state *state, float mean, float std_dev, float *result, ptrdiff_t n);
GUF_RAND_KWRDS void guf_rand32_normal_sample_f64(guf_rand32_state *state, double mean, double std_dev, double *result, ptrdiff_t n);
GUF_RAND_KWRDS float guf_rand32_normal_sample_one_f32(guf_rand32_state *state, float mean, float std_dev);
GUF_RAND_KWRDS double guf_rand32_normal_sample_one_f64(guf_rand32_state *state, double mean, double std_dev);

#endif

#if defined(GUF_RAND_IMPL) || defined(GUF_RAND_IMPL_STATIC) 
#include <math.h>
#include <float.h>
#include "guf_common.h"
#include "guf_assert.h"
#include "guf_math.h"

#ifdef UINT64_MAX
    /* 
        splitmix64 written in 2015 by Sebastiano Vigna (vigna@acm.org) (released as public domain)
        cf. https://prng.di.unimi.it/splitmix64.c (last-retrieved 2025-02-11)
    */
    GUF_RAND_KWRDS uint64_t guf_rand_splitmix64(uint64_t *state) 
    {
        GUF_ASSERT(state);
        uint64_t z = ((*state) += 0x9e3779b97f4a7c15);
        z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
        z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
        return z ^ (z >> 31);
    }
#endif

/*
    splitmix32 written in 2016 by Kaito Udagawa (released under CC0 <http://creativecommons.org/publicdomain/zero/1.0/>)
    cf. https://github.com/umireon/my-random-stuff/blob/master/xorshift/splitmix32.c (last-retrieved 2025-03-28)
*/
GUF_RAND_KWRDS uint32_t guf_rand_splitmix32(uint32_t *state)
{
    GUF_ASSERT(state);
    uint32_t z = (*state += 0x9e3779b9);
    z = (z ^ (z >> 16)) * 0x85ebca6b;
    z = (z ^ (z >> 13)) * 0xc2b2ae35;
    return z ^ (z >> 16);
}

GUF_RAND_KWRDS void guf_rand32_state_init(guf_rand32_state *state, uint32_t seed)
{
    for (size_t i = 0; i < GUF_ARR_SIZE(state->s); ++i) {
        state->s[i] = guf_rand_splitmix32(&seed);
    }
    if (!state->s[0] && !state->s[1] && !state->s[2] && !state->s[3]) { // State must not be only zeroes:
        state->s[0] = 0x9e3779b9; // arbitrary constant != 0
        seed = state->s[0];
        for (size_t i = 1; i < GUF_ARR_SIZE(state->s); ++i) {
            state->s[i] = guf_rand_splitmix32(&seed);
        }
    }
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS void guf_rand64_state_init(guf_rand64_state *state, uint64_t seed)
    {
        for (size_t i = 0; i < GUF_ARR_SIZE(state->s); ++i) {
            state->s[i] = guf_rand_splitmix64(&seed);
        }
        if (!state->s[0] && !state->s[1] && !state->s[2] && !state->s[3]) { // State must not be only zeroes:
            state->s[0] = 0x9e3779b97f4a7c15; // arbitrary constant != 0
            seed = state->s[0];
            for (size_t i = 1; i < GUF_ARR_SIZE(state->s); ++i) {
                state->s[i] = guf_rand_splitmix64(&seed);
            }
        }
    }
#endif

GUF_RAND_KWRDS void guf_randstate_init(guf_randstate *state, guf_rand_seed_t seed)
{
    #ifdef GUF_RAND_32_BIT
        guf_rand32_state_init(state, seed);
    #else
        guf_rand64_state_init(state, seed);
    #endif
}
  
GUF_RAND_KWRDS uint32_t guf_rand32_u32(guf_rand32_state *state)
{
    GUF_ASSERT(state);
    GUF_ASSERT(state->s[0] || state->s[1] || state->s[2] || state->s[3]);
    /*
        xoshiro128** 1.1 (public domain) written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
        cf. https://prng.di.unimi.it/xoshiro128starstar.c (last-retrieved 2025-02-11)
    */
	const uint32_t result = guf_rotl_u32(state->s[1] * 5, 7) * 9;
	const uint32_t t = state->s[1] << 9;
	state->s[2] ^= state->s[0];
	state->s[3] ^= state->s[1];
	state->s[1] ^= state->s[2];
	state->s[0] ^= state->s[3];
	state->s[2] ^= t;
	state->s[3] = guf_rotl_u32(state->s[3], 11);
	return result;
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS uint32_t guf_rand64_u32(guf_rand64_state *state)
    {
        return (uint32_t)(guf_rand64_u64(state) >> 32);
    }
#endif

GUF_RAND_KWRDS uint32_t guf_rand_u32(guf_randstate *state)
{
    GUF_ASSERT(state);
    GUF_ASSERT(state->s[0] || state->s[1] || state->s[2] || state->s[3]);
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_u32(state);
    #else
        return guf_rand64_u32(state);
    #endif
}


#ifdef UINT64_MAX
    GUF_RAND_KWRDS uint64_t guf_rand64_u64(guf_rand64_state *state)
    {
        GUF_ASSERT(state);
        GUF_ASSERT(state->s[0] || state->s[1] || state->s[2] || state->s[3]);
        /*
            xoshiro256** 1.0 (public domain) written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
            cf. https://prng.di.unimi.it/xoshiro256starstar.c (last-retrieved 2025-02-11)
        */
        const uint64_t result = guf_rotl_u64(state->s[1] * 5, 7) * 9;
        const uint64_t t = state->s[1] << 17;
        state->s[2] ^= state->s[0];
        state->s[3] ^= state->s[1];
        state->s[1] ^= state->s[2];
        state->s[0] ^= state->s[3];
        state->s[2] ^= t;
        state->s[3] = guf_rotl_u64(state->s[3], 45);
        return result;
    }
#endif

#ifdef UINT64_MAX
GUF_RAND_KWRDS uint64_t guf_rand32_u64(guf_rand32_state *state)
#else
GUF_RAND_KWRDS uint_least64_t guf_rand32_u64(guf_rand32_state *state)
#endif
{
    GUF_ASSERT(state);
    GUF_ASSERT(state->s[0] || state->s[1] || state->s[2] || state->s[3]);
    const uint32_t lower_bits = guf_rand32_u32(state);
    const uint32_t upper_bits = guf_rand32_u32(state);
    #ifdef UINT64_MAX
       return ((uint64_t)upper_bits << 32) | (uint64_t)lower_bits; // TODO: not sure if that's a good idea...
    #else
        return ((uint_least64_t)upper_bits << 32) | (uint_least64_t)lower_bits; // TODO: not sure if that's a good idea...
    #endif
}


#ifdef UINT64_MAX
GUF_RAND_KWRDS uint64_t guf_rand_u64(guf_randstate *state)
#else
GUF_RAND_KWRDS uint_least64_t guf_rand_u64(guf_randstate *state)
#endif
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_u64(state);
    #else
        return guf_rand64_u64(state);
    #endif
}

/*
    Equivalent to 2^64 calls to guf_rand32_u32; can be used to generate 2^64
    non-overlapping subsequences for parallel computations.
*/
GUF_RAND_KWRDS void guf_rand32_state_jump(guf_rand32_state *state) 
{
    GUF_ASSERT(state);
    static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b };
	uint32_t s0 = 0;
	uint32_t s1 = 0;
	uint32_t s2 = 0;
	uint32_t s3 = 0;
	for (size_t i = 0; i < sizeof JUMP / sizeof *JUMP; ++i) {
	    for (int b = 0; b < 32; ++b) {
		    if (JUMP[i] & UINT32_C(1) << b) {
			    s0 ^= state->s[0];
				s1 ^= state->s[1];
				s2 ^= state->s[2];
				s3 ^= state->s[3];
			}
			guf_rand32_u32(state);	
		}
    }
	state->s[0] = s0;
	state->s[1] = s1;
	state->s[2] = s2;
	state->s[3] = s3;
}


#ifdef UINT64_MAX
    /*  
        Equivalent to 2^128 calls to guf_rand64_u64(); can be used to generate 2^128
        non-overlapping subsequences for parallel computations. 
    */
    GUF_RAND_KWRDS void guf_rand64_state_jump(guf_rand64_state *state) 
    {
        static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };
        uint64_t s0 = 0;
        uint64_t s1 = 0;
        uint64_t s2 = 0;
        uint64_t s3 = 0;
        for (size_t i = 0; i < sizeof JUMP / sizeof *JUMP; ++i) {
            for (int b = 0; b < 64; ++b) {
                if (JUMP[i] & UINT64_C(1) << b) {
                    s0 ^= state->s[0];
                    s1 ^= state->s[1];
                    s2 ^= state->s[2];
                    s3 ^= state->s[3];
                }
                guf_rand64_u64(state);
            }
        }
        state->s[0] = s0;
        state->s[1] = s1;
        state->s[2] = s2;
        state->s[3] = s3;
    }
#endif

/*  
   Equivalent to 2^128 calls to guf_rand() (or 2^64 calls if GUF_RAND_32_BIT); it can be used to generate 2^128 (or 2^64)
   non-overlapping subsequences for parallel computations. 
*/
GUF_RAND_KWRDS void guf_randstate_jump(guf_randstate *state) 
{
    GUF_ASSERT(state);
    #ifdef GUF_RAND_32_BIT
        guf_rand32_state_jump(state);
    #else
        guf_rand64_state_jump(state);
    #endif
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS double guf_rand64_f64(guf_rand64_state *state)
    {
        // cf. https://prng.di.unimi.it/ and https://dotat.at/@/2023-06-23-random-double.html (last-retrieved 2025-02-11)
        return (guf_rand64_u64(state) >> 11) * 0x1.0p-53; // 11 == 64 - 53 (double has a 53-bit mantissa/significand)
    }
#endif

GUF_RAND_KWRDS double guf_rand32_f64(guf_rand32_state *state)
{
    // cf. https://prng.di.unimi.it/ and https://dotat.at/@/2023-06-23-random-double.html (last-retrieved 2025-02-11)
    return (guf_rand32_u64(state) >> 11) * 0x1.0p-53; // 11 == 64 - 53 (double has a 53-bit mantissa/significand)
}

// Generate double in the unit interval [0, 1)
GUF_RAND_KWRDS double guf_rand_f64(guf_randstate *state)
{
    // cf. https://prng.di.unimi.it/ and https://dotat.at/@/2023-06-23-random-double.html (last-retrieved 2025-02-11)
    return (guf_rand_u64(state) >> 11) * 0x1.0p-53; // 11 == 64 - 53 (double has a 53-bit mantissa/significand)
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS float guf_rand64_f32(guf_rand64_state *state)
    {
        return (guf_rand64_u64(state) >> 40) * 0x1.0p-24f; // 40 == 64 - 24; (float has a 24-bit mantissa/significand)
    }
#endif

GUF_RAND_KWRDS float guf_rand32_f32(guf_rand32_state *state)
{
    return (guf_rand32_u32(state) >> 8) * 0x1.0p-24f;  // 8 == 32 - 24; (float has a 24-bit mantissa/significand)
}

// Generate float in the unit interval [0, 1)
GUF_RAND_KWRDS float guf_rand_f32(guf_randstate *state)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_f32(state);
    #else
        return guf_rand64_f32(state);
    #endif
}


GUF_RAND_KWRDS bool guf_rand32_bernoulli_trial_f32(guf_rand32_state *state, float p)
{
    p = guf_clamp_f32(p, 0, 1);
    return guf_rand32_f32(state) < p; // never true for p = 0, always true for p = 1 since guf_rand_f64 is in range [0, 1)
}
#ifdef UINT64_MAX
    GUF_RAND_KWRDS bool guf_rand64_bernoulli_trial_f32(guf_rand64_state *state, float p)
    {
        p = guf_clamp_f32(p, 0, 1);
        return guf_rand64_f32(state) < p; // never true for p = 0, always true for p = 1 since guf_rand_f64 is in range [0, 1)
    }
#endif

GUF_RAND_KWRDS bool guf_rand_bernoulli_trial_f32(guf_randstate *state, float p)
{
    p = guf_clamp_f32(p, 0, 1);
    return guf_rand_f32(state) < p; // never true for p = 0, always true for p = 1 since guf_rand_f64 is in range [0, 1)
}

GUF_RAND_KWRDS bool guf_rand32_bernoulli_trial_f64(guf_rand32_state *state, double p)
{
    p = guf_clamp_f64(p, 0, 1);
    return guf_rand32_f64(state) < p; // never true for p = 0, always true for p = 1 since guf_rand_f64 is in range [0, 1)
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS bool guf_rand64_bernoulli_trial_f64(guf_rand64_state *state, double p)
    {
        p = guf_clamp_f64(p, 0, 1);
        return guf_rand64_f64(state) < p; // never true for p = 0, always true for p = 1 since guf_rand_f64 is in range [0, 1)
    }
#endif

GUF_RAND_KWRDS bool guf_rand_bernoulli_trial_f64(guf_randstate *state, double p)
{
    p = guf_clamp_f64(p, 0, 1);
    return guf_rand_f64(state) < p; // never true for p = 0, always true for p = 1 since guf_rand_f64 is in range [0, 1)
}

GUF_RAND_KWRDS bool guf_rand32_flip(guf_rand32_state *state)
{
    return guf_rand32_bernoulli_trial_f32(state, 0.5f);
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS bool guf_rand64_flip(guf_rand64_state *state)
    {
        return guf_rand64_bernoulli_trial_f64(state, 0.5);
    }
#endif

GUF_RAND_KWRDS bool guf_rand_flip(guf_randstate *state)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand_bernoulli_trial_f32(state, 0.5f);
    #else
        return guf_rand_bernoulli_trial_f64(state, 0.5);
    #endif
}

GUF_RAND_KWRDS double guf_rand32_range_f64(guf_rand32_state *state, double min, double end)
{
    if (min == (double)INFINITY) {
        min = DBL_MAX;
    } else if (min == (double)-INFINITY) {
        min = -DBL_MAX;
    }
    if (end == (double)INFINITY) {
        end = DBL_MAX;
    } else if (end == (double)-INFINITY) {
        end = -DBL_MAX;
    }
    GUF_ASSERT_RELEASE(end >= min);
    return guf_rand32_f64(state) * (end - min) + min;
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS double guf_rand64_range_f64(guf_rand64_state *state, double min, double end)
    {
        if (min == (double)INFINITY) {
            min = DBL_MAX;
        } else if (min == (double)-INFINITY) {
            min = -DBL_MAX;
        }
        if (end == (double)INFINITY) {
            end = DBL_MAX;
        } else if (end == (double)-INFINITY) {
            end = -DBL_MAX;
        }
        GUF_ASSERT_RELEASE(end >= min);
        return guf_rand64_f64(state) * (end - min) + min;
    }
#endif

// returns uniformly-distributed random double in range [min, end) (or min if min == end)
GUF_RAND_KWRDS double guf_randrange_f64(guf_randstate *state, double min, double end)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_range_f64(state, min, end);
    #else
        return guf_rand64_range_f64(state, min, end);
    #endif
}

// returns uniformly-distributed random float in range [min, end) (or min if min == end)
GUF_RAND_KWRDS float guf_rand32_range_f32(guf_rand32_state *state, float min, float end)
{
    if (min == INFINITY) {
        min = FLT_MAX;
    } else if (min == -INFINITY) {
        min = -FLT_MAX;
    }
    if (end == INFINITY) {
        end = FLT_MAX;
    } else if (end == -INFINITY) {
        end = -FLT_MAX;
    }
    GUF_ASSERT_RELEASE(end >= min);
    return guf_rand32_f32(state) * (end - min) + min;
}

#ifdef UINT64_MAX
    // returns uniformly-distributed random float in range [min, end) (or min if min == end)
    GUF_RAND_KWRDS float guf_rand64_range_f32(guf_rand64_state *state, float min, float end)
    {
        if (min == INFINITY) {
            min = FLT_MAX;
        } else if (min == -INFINITY) {
            min = -FLT_MAX;
        }
        if (end == INFINITY) {
            end = FLT_MAX;
        } else if (end == -INFINITY) {
            end = -FLT_MAX;
        }
        GUF_ASSERT_RELEASE(end >= min);
        return guf_rand64_f32(state) * (end - min) + min;
    }
#endif

// returns uniformly-distributed random float in range [min, end) (or min if min == end)
GUF_RAND_KWRDS float guf_randrange_f32(guf_randstate *state, float min, float end)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_range_f32(state, min, end);
    #else
        return guf_rand64_range_f32(state, min, end);
    #endif
}


#ifdef UINT64_MAX
    // returns uniformly-distributed random int32_t in range [min, max] (max is inclusive as opposed to the f32/f64 versions)
    GUF_RAND_KWRDS int32_t guf_rand64_range_i32(guf_rand64_state *state, int32_t min, int32_t max)
    {
        GUF_ASSERT_RELEASE(max >= min);
        if (min == max) {
            return min;
        }
        const double delta = (double)max - (double)min; 
        // cf. https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/random (last-retrieved 2025-02-12)
        const double result = floor(guf_rand64_f64(state) * (delta + 1.0) + min);
        GUF_ASSERT(result >= min && result <= max);
        return (int32_t)result;
    }
#endif

// returns uniformly-distributed random int32_t in range [min, max] (max is inclusive as opposed to the f32/f64 versions)
GUF_RAND_KWRDS int32_t guf_rand32_range_i32(guf_rand32_state *state, int32_t min, int32_t max)
{
    GUF_ASSERT_RELEASE(max >= min);
    if (min == max) {
        return min;
    }

    const uint32_t rand_max_i32 = UINT32_MAX >> 1; // 2^31 - 1 (== INT32_MAX)
    
    const uint32_t delta = guf_absdiff_i32(max, min);
    if (delta > rand_max_i32) {
        guf_panic(GUF_ERR_INT_OVERFLOW, GUF_ERR_MSG("in function guf_rand32_range_i32: interval [min, max] larger than INT32_MAX"));
        return -1;
    }
    /* 
        We don't use the same approach as in guf_rand64_range_i32 because a 32-bit float only has a 24-bit mantissa
        (and using double like in guf_rand64_range_i32 would require 64-bit arithmetic due to guf_rand32_f64).

        cf. https://c-faq.com/lib/randrange.html (last-retrieved 2025-02-11)
            https://stackoverflow.com/a/6852396  (last-retrieved 2025-02-11)
    */
    const uint32_t num_rand_vals = rand_max_i32 + 1u; // 2^31
    const uint32_t num_bins = (delta + 1u); 

    const uint32_t bin_size = num_rand_vals / num_bins; // bin_size = floor(num_rand_vals / num_bins)
    const uint32_t limit = num_rand_vals - (num_rand_vals % num_bins); // limit == bin_size * num_bins
    GUF_ASSERT(limit == bin_size * num_bins);
    /*
       since (num_rand_vals % num_bins) is at most 2^30 + 1 (I think...), the minimum limit is 2^31 - (2^30 + 1), 
       which means in the worst case, the chance of having to iterate (i.e. step >= limit) 
       is 1 - (2^31 - (2^30 + 1)) / 2^31 == 0.5
    */ 
   uint32_t step; 
    do {
        step = guf_rand32_u32(state) >> 1; // [0, 2^31 - 1]
    } while (step >= limit); 
    step = step / bin_size;

    GUF_ASSERT(!guf_add_is_overflow_i32(min, step));
    const int32_t rnd = min + (int32_t)step;
    GUF_ASSERT(rnd >= min && rnd <= max);
    return rnd;
}

// returns uniformly-distributed random int32_t in range [min, max] (max is inclusive as opposed to the f32/f64 versions)
GUF_RAND_KWRDS int32_t guf_randrange_i32(guf_randstate *state, int32_t min, int32_t max)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_range_i32(state, min, max);
    #else
        return guf_rand64_range_i32(state, min, max);
    #endif
}

GUF_RAND_KWRDS uint32_t guf_rand32_range_u32(guf_rand32_state *state, uint32_t min, uint32_t max)
{
    /*
        The method used in guf_rand32_range_i32 above (which uses only 32-bit integer arithmetic) could overflow here,
        so we use the floating-point method since we have to use 64-bit arithmetic anyways.
    */

    GUF_ASSERT_RELEASE(max >= min);
    if (min == max) {
        return min;
    }
    const double delta = (double)max - (double)min;
    const double result = floor(guf_rand32_f64(state) * (delta + 1.0) + min); // NOTE: guf_rand32_f64 is slow for 32-bit platforms...
    GUF_ASSERT(result >= min && result <= max);
    return (uint32_t)result;
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS uint32_t guf_rand64_range_u32(guf_rand64_state *state, uint32_t min, uint32_t max)
    {
        GUF_ASSERT_RELEASE(max >= min);
        if (min == max) {
            return min;
        }
        const double delta = (double)max - (double)min;
        const double result = floor(guf_rand64_f64(state) * (delta + 1.0) + min);
        GUF_ASSERT(result >= min && result <= max);
        return (uint32_t)result;
    }
#endif

GUF_RAND_KWRDS uint32_t guf_randrange_u32(guf_randstate *state, uint32_t min, uint32_t max)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_range_u32(state, min, max);
    #else
        return guf_rand64_range_u32(state, min, max);
    #endif
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS int64_t guf_rand64_range_i64(guf_rand64_state *state, int64_t min, int64_t max)
    {
        GUF_ASSERT_RELEASE(max >= min);
        if (min == max) {
            return min;
        }

        const uint64_t rand_max_i64 = UINT64_MAX >> 1; // 2^63 - 1 (== INT64_MAX)
        
        const uint64_t delta = guf_absdiff_i64(max, min);
        if (delta > rand_max_i64) {
            guf_panic(GUF_ERR_INT_OVERFLOW, GUF_ERR_MSG("in function guf_randrange_i64: interval [min, max] larger than INT64_MAX"));
            return -1;
        }
        /* 
            We should not use the same approach as in guf_rand64_range_i32 because (max - min) might be close to 2^63 - 1

            cf. https://c-faq.com/lib/randrange.html (last-retrieved 2025-02-11)
                https://stackoverflow.com/a/6852396  (last-retrieved 2025-02-11)
        */
        const uint64_t num_rand_vals = rand_max_i64 + 1u; // 2^63
        const uint64_t num_bins = (delta + 1u); 

        const uint64_t bin_size = num_rand_vals / num_bins; // bin_size = floor(num_rand_vals / num_bins)
        const uint64_t limit = num_rand_vals - (num_rand_vals % num_bins); // limit == bin_size * num_bins
        GUF_ASSERT(limit == bin_size * num_bins);
        /*
        since (num_rand_vals % num_bins) is at most 2^62 + 1 (I think...), the minimum limit is 2^63 - (2^62 + 1), 
        which means in the worst case, the chance of having to iterate (i.e. step >= limit) 
        is 1 - (2^63 - (2^62 + 1)) / 2^63 == 0.5
        */ 
        uint64_t step; 
        do {
            step = guf_rand64_u64(state) >> 1; // [0, 2^63 - 1]
        } while (step >= limit); 
        step = step / bin_size;

        GUF_ASSERT(!guf_add_is_overflow_i64(min, step));
        const int64_t rnd = min + (int64_t)step;
        GUF_ASSERT(rnd >= min && rnd <= max);
        return rnd;
    }
#endif

#if !defined(GUF_RAND_32_BIT) && defined(UINT64_MAX)
    // returns uniformly-distributed random int64_t in range [min, max] (max is inclusive as opposed to the f32/f64 versions)
    GUF_RAND_KWRDS int64_t guf_randrange_i64(guf_randstate *state, int64_t min, int64_t max)
    {
        return guf_rand64_range_i64(state, min, max);
    }
#endif

// Box-Müller-transform transcribed from wikipedia, cf. https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform (last-retrieved 2025-02-12)
GUF_RAND_KWRDS void guf_rand32_normal_sample_f64(guf_rand32_state *state, double mean, double std_dev, double *result, ptrdiff_t n)
{
    GUF_ASSERT_RELEASE(result);
    GUF_ASSERT_RELEASE(n >= 0);
    const double TAU = 2.0 * GUF_PI;

    ptrdiff_t i = 0; 
    while (i < n) {
        double u1, u2; 
        do {
            u1 = guf_rand32_f64(state);
        } while (u1 == 0); 
        u2 = guf_rand32_f64(state);

        const double mag = std_dev * sqrt(-2.0 * log(u1));
        result[i++] = mag * cos(TAU * u2) + mean;
        if (i < n) {
            result[i++] = mag * sin(TAU * u2) + mean;
        }  
    }
}

#ifdef UINT64_MAX
    // Box-Müller-transform transcribed from wikipedia, cf. https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform (last-retrieved 2025-02-12)
    GUF_RAND_KWRDS void guf_rand64_normal_sample_f64(guf_rand64_state *state, double mean, double std_dev, double *result, ptrdiff_t n)
    {
        GUF_ASSERT_RELEASE(result);
        GUF_ASSERT_RELEASE(n >= 0);
        const double TAU = 2.0 * GUF_PI;

        ptrdiff_t i = 0; 
        while (i < n) {
            double u1, u2; 
            do {
                u1 = guf_rand64_f64(state);
            } while (u1 == 0); 
            u2 = guf_rand64_f64(state);

            const double mag = std_dev * sqrt(-2.0 * log(u1));
            result[i++] = mag * cos(TAU * u2) + mean;
            if (i < n) {
                result[i++] = mag * sin(TAU * u2) + mean;
            }  
        }
    }
#endif

// Box-Müller-transform transcribed from wikipedia, cf. https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform (last-retrieved 2025-02-12)
GUF_RAND_KWRDS void guf_rand_normal_sample_f64(guf_randstate *state, double mean, double std_dev, double *result, ptrdiff_t n)
{
    #ifdef GUF_RAND_32_BIT
        guf_rand32_normal_sample_f64(state, mean, std_dev, result, n);
    #else
        guf_rand64_normal_sample_f64(state, mean, std_dev, result, n);
    #endif
}

GUF_RAND_KWRDS void guf_rand32_normal_sample_f32(guf_rand32_state *state, float mean, float std_dev, float *result, ptrdiff_t n)
{
    GUF_ASSERT_RELEASE(result);
    GUF_ASSERT_RELEASE(n >= 0);
    const float TAU = 2.f * (float)GUF_PI;

    ptrdiff_t i = 0; 
    while (i < n) {
        float u1, u2; 
        do {
            u1 = guf_rand32_f32(state);
        } while (u1 == 0); 
        u2 = guf_rand32_f32(state);

        const float mag = std_dev * sqrtf(-2.f * logf(u1));
        result[i++] = mag * cosf(TAU * u2) + mean;
        if (i < n) {
            result[i++] = mag * sinf(TAU * u2) + mean;
        }  
    }
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS void guf_rand64_normal_sample_f32(guf_rand64_state *state, float mean, float std_dev, float *result, ptrdiff_t n)
    {
        GUF_ASSERT_RELEASE(result);
        GUF_ASSERT_RELEASE(n >= 0);
        const float TAU = 2.f * (float)GUF_PI;

        ptrdiff_t i = 0; 
        while (i < n) {
            float u1, u2; 
            do {
                u1 = guf_rand64_f32(state);
            } while (u1 == 0); 
            u2 = guf_rand64_f32(state);

            const float mag = std_dev * sqrtf(-2.f * logf(u1));
            result[i++] = mag * cosf(TAU * u2) + mean;
            if (i < n) {
                result[i++] = mag * sinf(TAU * u2) + mean;
            }  
        }
    }
#endif

GUF_RAND_KWRDS void guf_rand_normal_sample_f32(guf_randstate *state, float mean, float std_dev, float *result, ptrdiff_t n)
{
    #ifdef GUF_RAND_32_BIT
        guf_rand32_normal_sample_f32(state, mean, std_dev, result, n);
    #else
        guf_rand64_normal_sample_f32(state, mean, std_dev, result, n);
    #endif
}

GUF_RAND_KWRDS double guf_rand32_normal_sample_one_f64(guf_rand32_state *state, double mean, double std_dev)
{
    double result; 
    guf_rand32_normal_sample_f64(state, mean, std_dev, &result, 1);
    return result;
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS double guf_rand64_normal_sample_one_f64(guf_rand64_state *state, double mean, double std_dev)
    {
        double result; 
        guf_rand64_normal_sample_f64(state, mean, std_dev, &result, 1);
        return result;
    }
#endif

GUF_RAND_KWRDS double guf_rand_normal_sample_one_f64(guf_randstate *state, double mean, double std_dev)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_normal_sample_one_f64(state, mean, std_dev);
    #else
        return guf_rand64_normal_sample_one_f64(state, mean, std_dev);
    #endif
}

GUF_RAND_KWRDS float guf_rand32_normal_sample_one_f32(guf_rand32_state *state, float mean, float std_dev)
{
    float result;
    guf_rand32_normal_sample_f32(state, mean, std_dev, &result, 1);
    return result;
}

#ifdef UINT64_MAX
    GUF_RAND_KWRDS float guf_rand64_normal_sample_one_f32(guf_rand64_state *state, float mean, float std_dev)
    {
        float result;
        guf_rand64_normal_sample_f32(state, mean, std_dev, &result, 1);
        return result;
    }
#endif

GUF_RAND_KWRDS float guf_rand_normal_sample_one_f32(guf_randstate *state, float mean, float std_dev)
{
    #ifdef GUF_RAND_32_BIT
        return guf_rand32_normal_sample_one_f32(state, mean, std_dev);
    #else
        return guf_rand64_normal_sample_one_f32(state, mean, std_dev);
    #endif
}

#undef GUF_RAND_IMPL
#undef GUF_RAND_IMPL_STATIC
#endif /* end impl */

#undef GUF_RAND_KWRDS