Coverage Report

Created: 2024-11-29 06:05

/builds/passgen/passgen-c/src/markov.c
Line
Count
Source (jump to first uncovered line)
1
#include "passgen/markov.h"
2
#include "passgen/assert.h"
3
#include "passgen/config.h"
4
#include <assert.h>
5
#include <stdlib.h>
6
#include <string.h>
7
48.7k
#define MIN(a, b)            ((a) < (b) ? 
(a)22.4k
:
(b)26.3k
)
8
26.7k
#define SATURATING_SUB(a, b) ((a) -MIN(a, b))
9
10
#define MARKOV_LEAF_INITIAL    4
11
#define MARKOV_LEAF_MULTIPLIER 2
12
13
#define passgen_markov_leaf_count_raw(leaf, codepoint) \
14
47.1M
    leaf->entries[leaf->capacity].count[codepoint % leaf->capacity]
15
16
#define passgen_markov_leaf_codepoint_raw(leaf, index) \
17
730k
    leaf->entries[0].codepoint[index % leaf->capacity]
18
19
/// Sizes for the leaf and node hash maps. These are all prime numbers, with the
20
/// property that every number is greater than double the previous one.
21
/// Generated with Ruby with:
22
///
23
/// ```ruby
24
/// require 'prime'
25
/// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l}
26
/// ```
27
static const size_t markov_sizes[] = {
28
    3,        7,        17,       37,     79,      163,     331,
29
    673,      1361,     2729,     5471,   10949,   21911,   43853,
30
    87719,    175447,   350899,   701819, 1403641, 2807303, 5614657,
31
    11229331, 22458671, 44917381, 0};
32
33
81.7k
size_t passgen_markov_node_size(size_t capacity) {
34
81.7k
    // we use capacity + 1 there to make sure it's a round number, such that the
35
81.7k
    // child pointers are not misaligned.
36
81.7k
    return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) +
37
81.7k
           capacity * sizeof(passgen_markov_node *);
38
81.7k
}
39
40
29.8k
size_t passgen_markov_leaf_size(size_t capacity) {
41
29.8k
    return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) +
42
29.8k
           capacity * sizeof(uint32_t);
43
29.8k
}
44
45
81.7k
passgen_markov_node *passgen_markov_node_new(size_t size_index) {
46
81.7k
    size_t capacity = markov_sizes[size_index];
47
81.7k
    passgen_assert(capacity);
48
81.7k
    size_t size = passgen_markov_node_size(capacity);
49
81.7k
    passgen_markov_node *node = calloc(1, size);
50
81.7k
    memset(node, 0, size);
51
81.7k
    node->capacity = capacity;
52
81.7k
    return node;
53
81.7k
}
54
55
29.8k
passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) {
56
29.8k
    size_t capacity = markov_sizes[size_index];
57
29.8k
    if(capacity == 0) {
58
0
        return NULL;
59
0
    }
60
29.8k
    size_t size = passgen_markov_leaf_size(capacity);
61
29.8k
    passgen_markov_leaf *leaf = calloc(1, size);
62
29.8k
    memset(leaf, 0, size);
63
29.8k
    leaf->capacity = capacity;
64
29.8k
    leaf->total_count = 0;
65
29.8k
    return leaf;
66
29.8k
}
67
68
29.5k
void passgen_markov_leaf_free(passgen_markov_leaf *leaf) {
69
29.5k
    PASSGEN_CLEAR(leaf);
70
29.5k
    free(leaf);
71
29.5k
}
72
73
80.8k
void passgen_markov_node_free(passgen_markov_node *node, size_t level) {
74
7.38M
    for(size_t i = 0; i < node->capacity; 
i++7.30M
) {
75
7.30M
        void *child = passgen_markov_node_child(node, i).leaf;
76
7.30M
        if(child) {
77
110k
            if(level < 2) {
78
29.5k
                passgen_markov_leaf_free(child);
79
80.8k
            } else {
80
80.8k
                passgen_markov_node_free(child, level - 1);
81
80.8k
            }
82
110k
        }
83
7.30M
    }
84
80.8k
85
80.8k
    PASSGEN_CLEAR(node);
86
80.8k
    free(node);
87
80.8k
}
88
89
0
void passgen_markov_node_check(passgen_markov_node *node, size_t level) {
90
0
    assert(node->capacity);
91
0
    assert(node->capacity % 2);
92
0
    for(size_t i = 0; i < node->capacity; i++) {
93
0
        void *data = passgen_markov_node_child(node, i).node;
94
0
        uint32_t codepoint = passgen_markov_node_codepoint(node, i);
95
0
        if(codepoint) {
96
0
            assert(data);
97
0
        }
98
0
        if(data) {
99
0
            assert((codepoint % node->capacity) == i);
100
0
            if(level > 2) {
101
0
                passgen_markov_node_check(data, level - 1);
102
0
            }
103
0
        }
104
0
    }
105
0
}
106
107
0
void passgen_markov_check(passgen_markov *markov) {
108
0
    passgen_markov_node_check(markov->root, markov->level);
109
0
}
110
111
// Initialize new markov chain with a given level
112
14
void passgen_markov_init(passgen_markov *markov, uint8_t level) {
113
14
    passgen_assert(level);
114
14
    memset(markov, 0, sizeof(*markov));
115
14
    markov->level = level;
116
14
    markov->root = passgen_markov_node_new(0);
117
14
}
118
119
225
passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) {
120
225
    size_t size_index = 0;
121
1.14k
    while(markov_sizes[size_index] < leaf->capacity) {
122
924
        size_index++;
123
924
    }
124
225
125
225
    passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1);
126
225
127
225
    // re-insert old elements
128
1.37M
    for(size_t i = 0; i < leaf->capacity; 
i++1.37M
) {
129
1.37M
        if(passgen_markov_leaf_count_raw(leaf, i)) {
130
184k
            uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i);
131
184k
            uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
132
184k
            new = passgen_markov_leaf_insert(new, codepoint, count);
133
184k
        }
134
1.37M
    }
135
225
136
225
    free(leaf);
137
225
138
225
    return new;
139
225
}
140
141
passgen_markov_leaf *passgen_markov_leaf_insert(
142
    passgen_markov_leaf *leaf,
143
    uint32_t codepoint,
144
319k
    size_t weight) {
145
319k
    // don't do anything if asked to insert a codepoint with an empty weight.
146
319k
    if(weight == 0) {
147
0
        return leaf;
148
0
    }
149
319k
150
319k
    
while(319k
passgen_markov_leaf_count_raw(leaf, codepoint) &&
151
319k
          
passgen_markov_leaf_codepoint_raw891
(leaf, codepoint) != codepoint891
) {
152
225
        leaf = passgen_markov_leaf_realloc(leaf);
153
225
    }
154
319k
155
319k
    // set codepoint
156
319k
    passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint;
157
319k
    passgen_markov_leaf_count_raw(leaf, codepoint) += weight;
158
319k
    leaf->total_count += weight;
159
319k
160
319k
    return leaf;
161
319k
}
162
163
913
passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) {
164
913
    size_t size_index = 0;
165
4.69k
    while(markov_sizes[size_index] < node->capacity) {
166
3.78k
        size_index++;
167
3.78k
    }
168
913
169
913
    passgen_markov_node *new = passgen_markov_node_new(size_index + 1);
170
913
171
913
    // re-insert old elements
172
7.06M
    for(size_t i = 0; i < node->capacity; 
i++7.05M
) {
173
7.05M
        if(passgen_markov_node_child(node, i).node) {
174
37.7k
            uint32_t codepoint = passgen_markov_node_codepoint(node, i);
175
37.7k
            new = passgen_markov_node_insert(new, codepoint);
176
37.7k
            assert(!passgen_markov_node_child(new, codepoint).node);
177
37.7k
            passgen_markov_node_child(new, codepoint).node =
178
37.7k
                passgen_markov_node_child(node, i).node;
179
37.7k
        }
180
7.05M
    }
181
913
182
913
    free(node);
183
913
184
913
    return new;
185
913
}
186
187
passgen_markov_node *
188
193k
passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) {
189
194k
    while(passgen_markov_node_child(node, codepoint).node &&
190
194k
          
passgen_markov_node_codepoint45.0k
(node, codepoint) != codepoint45.0k
) {
191
913
        node = passgen_markov_node_realloc(node);
192
913
    }
193
193k
194
193k
    // set codepoint
195
193k
    passgen_markov_node_codepoint(node, codepoint) = codepoint;
196
193k
197
193k
    return node;
198
193k
}
199
200
passgen_markov_node *passgen_markov_node_insert_word(
201
    passgen_markov_node *node,
202
    const uint32_t *sequence,
203
    size_t length,
204
154k
    size_t weight) {
205
154k
    uint32_t codepoint = sequence[0];
206
154k
    node = passgen_markov_node_insert(node, codepoint);
207
154k
208
154k
    if(length == 2) {
209
33.3k
        passgen_markov_leaf *child =
210
33.3k
            passgen_markov_node_child(node, codepoint).leaf;
211
33.3k
212
33.3k
        if(!child) {
213
29.5k
            child = passgen_markov_leaf_new(0);
214
29.5k
        }
215
33.3k
216
33.3k
        child = passgen_markov_leaf_insert(child, sequence[1], weight);
217
33.3k
218
33.3k
        passgen_markov_node_child(node, codepoint).leaf = child;
219
121k
    } else {
220
121k
        // retrieve or create child node
221
121k
        passgen_markov_node *child =
222
121k
            passgen_markov_node_child(node, codepoint).node;
223
121k
        if(!child) {
224
80.8k
            child = passgen_markov_node_new(0);
225
80.8k
        }
226
121k
227
121k
        // insert word into child
228
121k
        child = passgen_markov_node_insert_word(
229
121k
            child,
230
121k
            &sequence[1],
231
121k
            length - 1,
232
121k
            weight);
233
121k
234
121k
        // save child
235
121k
        passgen_markov_node_child(node, codepoint).node = child;
236
121k
    }
237
154k
238
154k
    return node;
239
154k
}
240
241
void passgen_markov_insert(
242
    passgen_markov *markov,
243
    const uint32_t *sequence,
244
33.3k
    size_t weight) {
245
33.3k
    markov->root = passgen_markov_node_insert_word(
246
33.3k
        markov->root,
247
33.3k
        sequence,
248
33.3k
        markov->level + 1,
249
33.3k
        weight);
250
33.3k
    markov->count += weight;
251
33.3k
}
252
253
// Add a word to a markov chain
254
void passgen_markov_add(
255
    passgen_markov *markov,
256
    const uint32_t *word,
257
    size_t word_len,
258
3.09k
    size_t weight) {
259
3.09k
    // insert start sequences
260
3.09k
    size_t sequence_len = 2 * markov->level + 1;
261
3.09k
    uint32_t sequence[sequence_len];
262
3.09k
    memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
263
3.09k
    memcpy(
264
3.09k
        &sequence[markov->level],
265
3.09k
        word,
266
3.09k
        sizeof(uint32_t) * MIN(markov->level, word_len));
267
15.8k
    for(size_t i = 0; i < MIN(markov->level, word_len); 
i++12.7k
) {
268
12.7k
        passgen_markov_insert(markov, &sequence[i], weight);
269
12.7k
    }
270
20.5k
    for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); 
i++17.4k
) {
271
17.4k
        passgen_markov_insert(markov, &word[i], weight);
272
17.4k
    }
273
3.09k
    memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
274
3.09k
    memcpy(
275
3.09k
        &sequence[SATURATING_SUB(markov->level, word_len)],
276
3.09k
        &word[SATURATING_SUB(word_len, markov->level)],
277
3.09k
        sizeof(uint32_t) * MIN(markov->level, word_len));
278
3.09k
    passgen_markov_insert(markov, &sequence[0], weight);
279
3.09k
}
280
281
// Free a markov chain
282
14
void passgen_markov_free(passgen_markov *markov) {
283
14
    passgen_markov_node_free(markov->root, markov->level);
284
14
    PASSGEN_CLEAR(markov);
285
14
}
286
287
uint32_t passgen_markov_generate(
288
    passgen_markov *markov,
289
    const uint32_t *current,
290
    passgen_random *random,
291
13.0k
    double *entropy) {
292
13.0k
    passgen_markov_node *node = markov->root;
293
13.0k
294
52.0k
    for(size_t i = 0; i < markov->level; 
i++39.0k
) {
295
39.0k
        node =
296
39.0k
            (passgen_markov_node *) passgen_markov_node_child(node, current[i])
297
39.0k
                .leaf;
298
39.0k
        assert(node);
299
39.0k
    }
300
13.0k
301
13.0k
    passgen_markov_leaf *leaf = (passgen_markov_leaf *) node;
302
13.0k
303
13.0k
    // record total choices.
304
13.0k
    if(entropy) {
305
0
        *entropy *= (double) leaf->total_count;
306
0
    }
307
13.0k
308
13.0k
    size_t choice = passgen_random_u64_max(random, leaf->total_count);
309
43.1M
    for(size_t i = 0; i < leaf->capacity; 
i++43.1M
) {
310
43.1M
        uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
311
43.1M
        if(count) {
312
512k
            if(choice < count) {
313
13.0k
                // record bucket (reduces total choices).
314
13.0k
                if(entropy) {
315
0
                    *entropy /= (double) count;
316
0
                }
317
13.0k
318
13.0k
                return passgen_markov_leaf_codepoint_raw(leaf, i);
319
499k
            } else {
320
499k
                choice -= count;
321
499k
            }
322
512k
        }
323
43.1M
    }
324
13.0k
325
13.0k
    assert(false);
326
0
327
0
    return 0;
328
0
}
329
330
uint32_t
331
212k
passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) {
332
212k
    if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) {
333
101k
        return passgen_markov_leaf_count_raw(leaf, codepoint);
334
110k
    } else {
335
110k
        return 0;
336
110k
    }
337
212k
}