Coverage Report

Created: 2024-04-19 06:04

/builds/xfbs/passgen/include/passgen/markov.h
Line
Count
Source
1
/// @file markov.h
2
/// @author Patrick M. Elsen <pelsen@xfbs.net>
3
/// @brief Markov chain data structure and functions.
4
///
5
/// The data structures and functions in this file are used to generate
6
/// pronounceable phrases by loading in a word list, generating a Markov chain
7
/// from it, and then using the Markov chain to generate sequences that are
8
/// pronounceable in English but not real words.
9
#pragma once
10
#include "passgen/util/random.h"
11
#include <stddef.h>
12
#include <stdint.h>
13
#include <stdbool.h>
14
15
/// Get the child with the given codepoint.
16
/// @memberof passgen_markov_node
17
/// @param node Node to get child of
18
/// @param codepoint Codepoint to lookup child for
19
#define passgen_markov_node_child(node, codepoint) \
20
8.33M
    node->entries[(node->capacity + 1) / 2].child[codepoint % node->capacity]
21
22
/// Get the codepoint at the given index
23
/// @memberof passgen_markov_node
24
/// @param node Node to get codepoint for
25
/// @param index Index to lookup
26
#define passgen_markov_node_codepoint(node, index) \
27
128k
    node->entries[0].codepoint[index % node->capacity]
28
29
/// Leaf node in the Markov chain. This contains a sorted list of
30
/// codepoint-count tuples.  During insertion, these are sorted by the
31
/// codepoint. When finalizing the Markov chain, they might be reordered to be
32
/// sorted by the frequency to allow for more efficient generation.
33
typedef struct {
34
    size_t total_count;
35
    size_t capacity;
36
    union {
37
        uint32_t count[1];
38
        uint32_t codepoint[1];
39
    } entries[0];
40
} passgen_markov_leaf;
41
42
/// Intermediate nodes in the markov chain tree. Implemented as custom hash map
43
/// where the keys are codepoints and the values are nodes or leafs.
44
typedef struct passgen_markov_node {
45
  size_t capacity;
46
  union {
47
    uint32_t codepoint[1];
48
    union {
49
        struct passgen_markov_node *node;
50
        passgen_markov_leaf *leaf;
51
    } child[1];
52
  } entries[0];
53
} passgen_markov_node;
54
55
/// Markov chain, used to generate pronounceable random words
56
///
57
/// Contains a tree of unicode codepoints and frequency data.
58
///
59
/// This data structure answers the question "given the previous characters x
60
/// and y, what are the probabilities for the next character?". Using
61
/// #passgen_markov_add, you can add words to it. Given a sequence of
62
/// characters, using #passgen_markov_generate you can generate a new
63
/// character at random.
64
///
65
/// The @ref length of a Markov chain is the length of tuples that it looks at.
66
typedef struct {
67
    /// Length of codepoint tuples that this Markov chains stores.
68
    size_t level;
69
    /// How many words are in this Markov chain
70
    size_t count;
71
    /// Root node of markov chain
72
    passgen_markov_node *root;
73
} passgen_markov;
74
75
76
/// Allocates a new Markov chain node with a size determined by the @ref
77
/// size_index.
78
///
79
/// Markov chain nodes only exist in specific, discrete sizes that are prime
80
/// numbers.  This is because they implement a kind of hash map and that makes
81
/// them more efficient.  The @ref size_index is an index into a static array
82
/// of precomputed prime sizes.
83
///
84
/// @memberof passgen_markov_node
85
passgen_markov_node *passgen_markov_node_new(size_t size_index);
86
87
/// Free a Markov chain node.
88
///
89
/// The parameter @ref level is the level of the Markov chain, this needs to be
90
/// passed so that the method knows how deep to free data.
91
///
92
/// @memberof passgen_markov_node
93
void passgen_markov_node_free(passgen_markov_node *node, size_t level);
94
95
/// Insert a new codepoint into a Markov chain node.
96
///
97
/// This returns a pointer to a passgen_markov_node because the node may need
98
/// to be reallocated if it would otherwise overflow.
99
///
100
/// @memberof passgen_markov_node
101
passgen_markov_node *passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint);
102
103
/// Insert a new word into this Markov chain node.
104
///
105
/// This will recursively insert the word into the appropriate child nodes. It
106
/// needs to know the @ref length of the word (this is equal to the @ref level
107
/// of the Markov chain, minus the depth it has already traversed) and the
108
/// weight of the word.
109
///
110
/// @memberof passgen_markov_node
111
passgen_markov_node *passgen_markov_node_insert_word(
112
    passgen_markov_node *node,
113
    const uint32_t *sequence,
114
    size_t length,
115
    size_t weight);
116
117
/// Allocates a new Markov leaf node with a size determined by the @ref
118
/// size_index.
119
///
120
/// Markov leaf nodes only exist in specific, discrete sizes that are prime
121
/// numbers.  This is because they implement a kind of hash map that makes them
122
/// more efficient.  The @ref size_index is an index into a static array of
123
/// precomputed prime sizes.
124
///
125
/// @memberof passgen_markov_leaf
126
passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index);
127
128
/// Free a Markov leaf node.
129
///
130
/// @memberof passgen_markov_leaf
131
void passgen_markov_leaf_free(passgen_markov_leaf *leaf);
132
133
/// Insert a final codepoint and weight into a Markov leaf node.
134
///
135
/// @memberof passgen_markov_leaf
136
passgen_markov_leaf *passgen_markov_leaf_insert(passgen_markov_leaf *leaf, uint32_t codepoint, size_t weight);
137
138
/// Given a codepoint, return a count or zero if the codepoint does not exist.
139
///
140
/// @memberof passgen_markov_leaf
141
uint32_t passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint);
142
143
/// Initialize new markov chain with a given level
144
///
145
/// @memberof passgen_markov
146
void passgen_markov_init(passgen_markov *markov, uint8_t level);
147
148
/// Add a word to a markov chain
149
///
150
/// @memberof passgen_markov
151
void passgen_markov_add(
152
    passgen_markov *markov,
153
    const uint32_t *word,
154
    size_t word_len,
155
    size_t weight);
156
157
/// Generate the next character at random using the Markov chain and a @ref passgen_random instance.
158
/// @param current Must contain markov->level amount of previous chars
159
/// @param markov Must be a parsed markov chain
160
/// @param random Must be a valid randomness instance
161
/// @param complexity Will be multiplied with the complexity of the choice if it not NULL.
162
/// @return The generated next character
163
/// @memberof passgen_markov
164
uint32_t passgen_markov_generate(
165
    passgen_markov *markov,
166
    const uint32_t *current,
167
    passgen_random *random,
168
    double *complexity);
169
170
/// Free a markov chain
171
///
172
/// @memberof passgen_markov
173
void passgen_markov_free(passgen_markov *markov);