summaryrefslogtreecommitdiff
path: root/venv/lib/python3.11/site-packages/faker/utils/distribution.py
blob: 45580a5bfc8eea3908c261c104ad2c4db8426020 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import bisect
import itertools

from random import Random
from typing import Generator, Iterable, Optional, Sequence, TypeVar

from faker.generator import random as mod_random


def random_sample(random: Optional[Random] = None) -> float:
    """Return one float obtained from ``uniform(0.0, 1.0)``.

    :param random: generator to draw from; when ``None`` the module-level
        ``mod_random`` instance is used instead.
    :return: the value produced by ``uniform(0.0, 1.0)`` on that generator.
    """
    rng = mod_random if random is None else random
    return rng.uniform(0.0, 1.0)


def cumsum(it: Iterable[float]) -> Generator[float, None, None]:
    """Lazily yield the running totals of ``it``.

    The accumulator starts at ``0``; after each input value is added, the
    current total is yielded. An empty iterable yields nothing.

    :param it: values to accumulate, consumed one at a time.
    :return: a generator producing one running total per input value.
    """
    running: float = 0
    for value in it:
        # Keep the augmented assignment: it is what defines how the
        # accumulator is updated for each element.
        running += value
        yield running


# Generic element type: the sampling helpers below take a ``Sequence[T]`` and
# return a ``Sequence[T]``, so the result elements have the input element type.
T = TypeVar("T")


def choices_distribution_unique(
    a: Sequence[T],
    p: Optional[Sequence[float]],
    random: Optional[Random] = None,
    length: int = 1,
) -> Sequence[T]:
    """Pick ``length`` elements of ``a`` by weight, without replacement.

    Each round rebuilds a normalized cumulative distribution from the weights
    that are still available, draws one value with ``random_sample``, locates
    it in that distribution with ``bisect_right``, and then removes the chosen
    element together with its weight so it cannot be picked again.

    :param a: population to sample from (copied, never mutated).
    :param p: one weight per element of ``a``; must not be ``None``.
    :param random: generator to draw from; ``mod_random`` when ``None``.
    :param length: number of elements to pick; at most ``len(a)``.
    :return: list of the picked elements, in the order they were drawn.
    :raises AssertionError: if ``p`` is ``None``, if ``a`` and ``p`` differ in
        length, or if ``length`` exceeds ``len(a)``.

    Note: if the weights still available in some round sum to zero, the
    normalization step divides by zero.
    """
    # Python's ``random`` module has no weighted sampling without
    # replacement, hence this hand-rolled version.
    rng = mod_random if random is None else random

    assert p is not None
    assert len(a) == len(p)
    assert len(a) >= length, "You can't request more unique samples than elements in the dataset."

    # Work on private copies so the caller's sequences are left untouched.
    remaining_items = list(a)
    remaining_weights = list(p)
    picked = []
    for _ in range(length):
        running_totals = tuple(cumsum(remaining_weights))
        weight_sum = running_totals[-1]
        normalized_cdf = [partial / weight_sum for partial in running_totals]
        draw = random_sample(random=rng)
        position = bisect.bisect_right(normalized_cdf, draw)
        picked.append(remaining_items[position])
        # Drop the winner and its weight before the next round.
        del remaining_weights[position]
        del remaining_items[position]
    return picked


def choices_distribution(
    a: Sequence[T],
    p: Optional[Sequence[float]],
    random: Optional[Random] = None,
    length: int = 1,
) -> Sequence[T]:
    if random is None:
        random = mod_random

    if p is not None:
        assert len(a) == len(p)

    if hasattr(random, "choices"):
        if length == 1 and p is None:
            return [random.choice(a)]
        else:
            return random.choices(a, weights=p, k=length)
    else:
        choices = []

        if p is None:
            p = itertools.repeat(1, len(a))  # type: ignore

        cdf = list(cumsum(p))  # type: ignore
        normal = cdf[-1]
        cdf2 = [i / normal for i in cdf]
        for i in range(length):
            uniform_sample = random_sample(random=random)
            idx = bisect.bisect_right(cdf2, uniform_sample)
            item = a[idx]
            choices.append(item)
        return choices