Coverage Report

Created: 2024-05-03 06:05

/builds/xfbs/passgen/src/markov.c
Line
Count
Source (jump to first uncovered line)
1
#include "passgen/markov.h"
2
#include "passgen/assert.h"
3
#include <assert.h>
4
#include <stdlib.h>
5
#include <string.h>
6
18.1k
#define MIN(a, b)            ((a) < (b) ? 
(a)8.42k
:
(b)9.74k
)
7
8.40k
#define SATURATING_SUB(a, b) ((a) -MIN(a, b))
8
9
#define MARKOV_LEAF_INITIAL    4
10
#define MARKOV_LEAF_MULTIPLIER 2
11
12
#define passgen_markov_leaf_count_raw(leaf, codepoint) \
13
91.5M
    leaf->entries[leaf->capacity].count[codepoint % leaf->capacity]
14
15
#define passgen_markov_leaf_codepoint_raw(leaf, index) \
16
731k
    leaf->entries[0].codepoint[index % leaf->capacity]
17
18
/// Sizes for the leaf and node hash maps. These are all prime numbers, with the
19
/// property that every number is greater than double the previous one.
20
/// Generated with Ruby with:
21
///
22
/// ```ruby
23
/// require 'prime'
24
/// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l}
25
/// ```
26
static const size_t markov_sizes[] = {
27
    3,        7,        17,       37,     79,      163,     331,
28
    673,      1361,     2729,     5471,   10949,   21911,   43853,
29
    87719,    175447,   350899,   701819, 1403641, 2807303, 5614657,
30
    11229331, 22458671, 44917381, 0};
31
32
30.8k
size_t passgen_markov_node_size(size_t capacity) {
33
30.8k
    // we use capacity + 1 there to make sure it's a round number, such that the
34
30.8k
    // child pointers are not misaligned.
35
30.8k
    return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) +
36
30.8k
           capacity * sizeof(passgen_markov_node *);
37
30.8k
}
38
39
9.38k
size_t passgen_markov_leaf_size(size_t capacity) {
40
9.38k
    return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) +
41
9.38k
           capacity * sizeof(uint32_t);
42
9.38k
}
43
44
30.8k
passgen_markov_node *passgen_markov_node_new(size_t size_index) {
45
30.8k
    size_t capacity = markov_sizes[size_index];
46
30.8k
    passgen_assert(capacity);
47
30.8k
    size_t size = passgen_markov_node_size(capacity);
48
30.8k
    passgen_markov_node *node = calloc(1, size);
49
30.8k
    memset(node, 0, size);
50
30.8k
    node->capacity = capacity;
51
30.8k
    return node;
52
30.8k
}
53
54
9.38k
passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) {
55
9.38k
    size_t capacity = markov_sizes[size_index];
56
9.38k
    if(capacity == 0) {
57
0
        return NULL;
58
0
    }
59
9.38k
    size_t size = passgen_markov_leaf_size(capacity);
60
9.38k
    passgen_markov_leaf *leaf = calloc(1, size);
61
9.38k
    memset(leaf, 0, size);
62
9.38k
    leaf->capacity = capacity;
63
9.38k
    leaf->total_count = 0;
64
9.38k
    return leaf;
65
9.38k
}
66
67
9.18k
void passgen_markov_leaf_free(passgen_markov_leaf *leaf) {
68
9.18k
    free(leaf);
69
9.18k
}
70
71
30.2k
void passgen_markov_node_free(passgen_markov_node *node, size_t level) {
72
4.11M
    for(size_t i = 0; i < node->capacity; 
i++4.08M
) {
73
4.08M
        void *child = passgen_markov_node_child(node, i).leaf;
74
4.08M
        if(child) {
75
39.4k
            if(level < 2) {
76
9.17k
                passgen_markov_leaf_free(child);
77
30.2k
            } else {
78
30.2k
                passgen_markov_node_free(child, level - 1);
79
30.2k
            }
80
39.4k
        }
81
4.08M
    }
82
30.2k
83
30.2k
    free(node);
84
30.2k
}
85
86
0
void passgen_markov_node_check(passgen_markov_node *node, size_t level) {
87
0
    assert(node->capacity);
88
0
    assert(node->capacity % 2);
89
0
    for(size_t i = 0; i < node->capacity; i++) {
90
0
        void *data = passgen_markov_node_child(node, i).node;
91
0
        uint32_t codepoint = passgen_markov_node_codepoint(node, i);
92
0
        if(codepoint) {
93
0
            assert(data);
94
0
        }
95
0
        if(data) {
96
0
            assert((codepoint % node->capacity) == i);
97
0
            if(level > 2) {
98
0
                passgen_markov_node_check(data, level - 1);
99
0
            }
100
0
        }
101
0
    }
102
0
}
103
104
0
void passgen_markov_check(passgen_markov *markov) {
105
0
    passgen_markov_node_check(markov->root, markov->level);
106
0
}
107
108
// Initialize new markov chain with a given level
109
13
void passgen_markov_init(passgen_markov *markov, uint8_t level) {
110
13
    passgen_assert(level);
111
13
    memset(markov, 0, sizeof(*markov));
112
13
    markov->level = level;
113
13
    markov->root = passgen_markov_node_new(0);
114
13
}
115
116
198
passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) {
117
198
    size_t size_index = 0;
118
988
    while(markov_sizes[size_index] < leaf->capacity) {
119
790
        size_index++;
120
790
    }
121
198
122
198
    passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1);
123
198
124
198
    // re-insert old elements
125
45.1M
    for(size_t i = 0; i < leaf->capacity; 
i++45.1M
) {
126
45.1M
        if(passgen_markov_leaf_count_raw(leaf, i)) {
127
193k
            uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i);
128
193k
            uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
129
193k
            new = passgen_markov_leaf_insert(new, codepoint, count);
130
193k
        }
131
45.1M
    }
132
198
133
198
    free(leaf);
134
198
135
198
    return new;
136
198
}
137
138
passgen_markov_leaf *passgen_markov_leaf_insert(
139
    passgen_markov_leaf *leaf,
140
    uint32_t codepoint,
141
315k
    size_t weight) {
142
315k
    // don't do anything if asked to insert a codepoint with an empty weight.
143
315k
    if(weight == 0) {
144
0
        return leaf;
145
0
    }
146
315k
147
316k
    
while(315k
passgen_markov_leaf_count_raw(leaf, codepoint) &&
148
316k
          
passgen_markov_leaf_codepoint_raw871
(leaf, codepoint) != codepoint871
) {
149
198
        leaf = passgen_markov_leaf_realloc(leaf);
150
198
    }
151
315k
152
315k
    // set codepoint
153
315k
    passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint;
154
315k
    passgen_markov_leaf_count_raw(leaf, codepoint) += weight;
155
315k
    leaf->total_count += weight;
156
315k
157
315k
    return leaf;
158
315k
}
159
160
588
passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) {
161
588
    size_t size_index = 0;
162
3.13k
    while(markov_sizes[size_index] < node->capacity) {
163
2.54k
        size_index++;
164
2.54k
    }
165
588
166
588
    passgen_markov_node *new = passgen_markov_node_new(size_index + 1);
167
588
168
588
    // re-insert old elements
169
3.99M
    for(size_t i = 0; i < node->capacity; 
i++3.99M
) {
170
3.99M
        if(passgen_markov_node_child(node, i).node) {
171
22.0k
            uint32_t codepoint = passgen_markov_node_codepoint(node, i);
172
22.0k
            new = passgen_markov_node_insert(new, codepoint);
173
22.0k
            assert(!passgen_markov_node_child(new, codepoint).node);
174
22.0k
            passgen_markov_node_child(new, codepoint).node =
175
22.0k
                passgen_markov_node_child(node, i).node;
176
22.0k
        }
177
3.99M
    }
178
588
179
588
    free(node);
180
588
181
588
    return new;
182
588
}
183
184
passgen_markov_node *
185
83.9k
passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) {
186
84.5k
    while(passgen_markov_node_child(node, codepoint).node &&
187
84.5k
          
passgen_markov_node_codepoint22.0k
(node, codepoint) != codepoint22.0k
) {
188
588
        node = passgen_markov_node_realloc(node);
189
588
    }
190
83.9k
191
83.9k
    // set codepoint
192
83.9k
    passgen_markov_node_codepoint(node, codepoint) = codepoint;
193
83.9k
194
83.9k
    return node;
195
83.9k
}
196
197
passgen_markov_node *passgen_markov_node_insert_word(
198
    passgen_markov_node *node,
199
    const uint32_t *sequence,
200
    size_t length,
201
60.8k
    size_t weight) {
202
60.8k
    uint32_t codepoint = sequence[0];
203
60.8k
    node = passgen_markov_node_insert(node, codepoint);
204
60.8k
205
60.8k
    if(length == 2) {
206
11.2k
        passgen_markov_leaf *child =
207
11.2k
            passgen_markov_node_child(node, codepoint).leaf;
208
11.2k
209
11.2k
        if(!child) {
210
9.17k
            child = passgen_markov_leaf_new(0);
211
9.17k
        }
212
11.2k
213
11.2k
        child = passgen_markov_leaf_insert(child, sequence[1], weight);
214
11.2k
215
11.2k
        passgen_markov_node_child(node, codepoint).leaf = child;
216
49.6k
    } else {
217
49.6k
        // retrieve or create child node
218
49.6k
        passgen_markov_node *child =
219
49.6k
            passgen_markov_node_child(node, codepoint).node;
220
49.6k
        if(!child) {
221
30.2k
            child = passgen_markov_node_new(0);
222
30.2k
        }
223
49.6k
224
49.6k
        // insert word into child
225
49.6k
        child = passgen_markov_node_insert_word(
226
49.6k
            child,
227
49.6k
            &sequence[1],
228
49.6k
            length - 1,
229
49.6k
            weight);
230
49.6k
231
49.6k
        // save child
232
49.6k
        passgen_markov_node_child(node, codepoint).node = child;
233
49.6k
    }
234
60.8k
235
60.8k
    return node;
236
60.8k
}
237
238
void passgen_markov_insert(
239
    passgen_markov *markov,
240
    const uint32_t *sequence,
241
11.2k
    size_t weight) {
242
11.2k
    markov->root = passgen_markov_node_insert_word(
243
11.2k
        markov->root,
244
11.2k
        sequence,
245
11.2k
        markov->level + 1,
246
11.2k
        weight);
247
11.2k
    markov->count += weight;
248
11.2k
}
249
250
// Add a word to a markov chain
251
void passgen_markov_add(
252
    passgen_markov *markov,
253
    const uint32_t *word,
254
    size_t word_len,
255
1.39k
    size_t weight) {
256
1.39k
    // insert start sequences
257
1.39k
    size_t sequence_len = 2 * markov->level + 1;
258
1.39k
    uint32_t sequence[sequence_len];
259
1.39k
    memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
260
1.39k
    memcpy(
261
1.39k
        &sequence[markov->level],
262
1.39k
        word,
263
1.39k
        sizeof(uint32_t) * MIN(markov->level, word_len));
264
6.97k
    for(size_t i = 0; i < MIN(markov->level, word_len); 
i++5.58k
) {
265
5.58k
        passgen_markov_insert(markov, &sequence[i], weight);
266
5.58k
    }
267
5.62k
    for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); 
i++4.23k
) {
268
4.23k
        passgen_markov_insert(markov, &word[i], weight);
269
4.23k
    }
270
1.39k
    memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
271
1.39k
    memcpy(
272
1.39k
        &sequence[SATURATING_SUB(markov->level, word_len)],
273
1.39k
        &word[SATURATING_SUB(word_len, markov->level)],
274
1.39k
        sizeof(uint32_t) * MIN(markov->level, word_len));
275
1.39k
    passgen_markov_insert(markov, &sequence[0], weight);
276
1.39k
}
277
278
// Free a markov chain
279
13
void passgen_markov_free(passgen_markov *markov) {
280
13
    passgen_markov_node_free(markov->root, markov->level);
281
13
}
282
283
uint32_t passgen_markov_generate(
284
    passgen_markov *markov,
285
    const uint32_t *current,
286
    passgen_random *random,
287
0
    double *entropy) {
288
0
    passgen_markov_node *node = markov->root;
289
0
290
0
    for(size_t i = 0; i < markov->level; i++) {
291
0
        node =
292
0
            (passgen_markov_node *) passgen_markov_node_child(node, current[i])
293
0
                .leaf;
294
0
        assert(node);
295
0
    }
296
0
297
0
    passgen_markov_leaf *leaf = (passgen_markov_leaf *) node;
298
0
299
0
    // record total choices.
300
0
    if(entropy) {
301
0
        *entropy *= (double) leaf->total_count;
302
0
    }
303
0
304
0
    size_t choice = passgen_random_u64_max(random, leaf->total_count);
305
0
    for(size_t i = 0; i < leaf->capacity; i++) {
306
0
        uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
307
0
        if(count) {
308
0
            if(choice < count) {
309
0
                // record bucket (reduces total choices).
310
0
                if(entropy) {
311
0
                    *entropy /= (double) count;
312
0
                }
313
0
314
0
                return passgen_markov_leaf_codepoint_raw(leaf, i);
315
0
            } else {
316
0
                choice -= count;
317
0
            }
318
0
        }
319
0
    }
320
0
321
0
    assert(false);
322
0
323
0
    return 0;
324
0
}
325
326
uint32_t
327
221k
passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) {
328
221k
    if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) {
329
110k
        return passgen_markov_leaf_count_raw(leaf, codepoint);
330
110k
    } else {
331
110k
        return 0;
332
110k
    }
333
221k
}