/builds/xfbs/passgen/src/markov.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "passgen/markov.h" |
2 | | #include "passgen/assert.h" |
3 | | #include <assert.h> |
4 | | #include <stdlib.h> |
5 | | #include <string.h> |
6 | 18.1k | #define MIN(a, b) ((a) < (b) ? (a)8.42k : (b)9.74k ) |
7 | 8.40k | #define SATURATING_SUB(a, b) ((a) -MIN(a, b)) |
8 | | |
9 | | #define MARKOV_LEAF_INITIAL 4 |
10 | | #define MARKOV_LEAF_MULTIPLIER 2 |
11 | | |
12 | | #define passgen_markov_leaf_count_raw(leaf, codepoint) \ |
13 | 91.5M | leaf->entries[leaf->capacity].count[codepoint % leaf->capacity] |
14 | | |
15 | | #define passgen_markov_leaf_codepoint_raw(leaf, index) \ |
16 | 731k | leaf->entries[0].codepoint[index % leaf->capacity] |
17 | | |
18 | | /// Sizes for the leaf and node hash maps. These are all prime numbers, with the |
19 | | /// property that every number is greater than double the previous one. |
20 | | /// Generated with Ruby with: |
21 | | /// |
22 | | /// ```ruby |
23 | | /// require 'prime' |
24 | | /// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l} |
25 | | /// ``` |
26 | | static const size_t markov_sizes[] = { |
27 | | 3, 7, 17, 37, 79, 163, 331, |
28 | | 673, 1361, 2729, 5471, 10949, 21911, 43853, |
29 | | 87719, 175447, 350899, 701819, 1403641, 2807303, 5614657, |
30 | | 11229331, 22458671, 44917381, 0}; |
31 | | |
32 | 30.8k | size_t passgen_markov_node_size(size_t capacity) { |
33 | 30.8k | // we use capacity + 1 there to make sure it's a round number, such that the |
34 | 30.8k | // child pointers are not misaligned. |
35 | 30.8k | return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) + |
36 | 30.8k | capacity * sizeof(passgen_markov_node *); |
37 | 30.8k | } |
38 | | |
39 | 9.38k | size_t passgen_markov_leaf_size(size_t capacity) { |
40 | 9.38k | return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) + |
41 | 9.38k | capacity * sizeof(uint32_t); |
42 | 9.38k | } |
43 | | |
44 | 30.8k | passgen_markov_node *passgen_markov_node_new(size_t size_index) { |
45 | 30.8k | size_t capacity = markov_sizes[size_index]; |
46 | 30.8k | passgen_assert(capacity); |
47 | 30.8k | size_t size = passgen_markov_node_size(capacity); |
48 | 30.8k | passgen_markov_node *node = calloc(1, size); |
49 | 30.8k | memset(node, 0, size); |
50 | 30.8k | node->capacity = capacity; |
51 | 30.8k | return node; |
52 | 30.8k | } |
53 | | |
54 | 9.38k | passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) { |
55 | 9.38k | size_t capacity = markov_sizes[size_index]; |
56 | 9.38k | if(capacity == 0) { |
57 | 0 | return NULL; |
58 | 0 | } |
59 | 9.38k | size_t size = passgen_markov_leaf_size(capacity); |
60 | 9.38k | passgen_markov_leaf *leaf = calloc(1, size); |
61 | 9.38k | memset(leaf, 0, size); |
62 | 9.38k | leaf->capacity = capacity; |
63 | 9.38k | leaf->total_count = 0; |
64 | 9.38k | return leaf; |
65 | 9.38k | } |
66 | | |
67 | 9.18k | void passgen_markov_leaf_free(passgen_markov_leaf *leaf) { |
68 | 9.18k | free(leaf); |
69 | 9.18k | } |
70 | | |
71 | 30.2k | void passgen_markov_node_free(passgen_markov_node *node, size_t level) { |
72 | 4.11M | for(size_t i = 0; i < node->capacity; i++4.08M ) { |
73 | 4.08M | void *child = passgen_markov_node_child(node, i).leaf; |
74 | 4.08M | if(child) { |
75 | 39.4k | if(level < 2) { |
76 | 9.17k | passgen_markov_leaf_free(child); |
77 | 30.2k | } else { |
78 | 30.2k | passgen_markov_node_free(child, level - 1); |
79 | 30.2k | } |
80 | 39.4k | } |
81 | 4.08M | } |
82 | 30.2k | |
83 | 30.2k | free(node); |
84 | 30.2k | } |
85 | | |
86 | 0 | void passgen_markov_node_check(passgen_markov_node *node, size_t level) { |
87 | 0 | assert(node->capacity); |
88 | 0 | assert(node->capacity % 2); |
89 | 0 | for(size_t i = 0; i < node->capacity; i++) { |
90 | 0 | void *data = passgen_markov_node_child(node, i).node; |
91 | 0 | uint32_t codepoint = passgen_markov_node_codepoint(node, i); |
92 | 0 | if(codepoint) { |
93 | 0 | assert(data); |
94 | 0 | } |
95 | 0 | if(data) { |
96 | 0 | assert((codepoint % node->capacity) == i); |
97 | 0 | if(level > 2) { |
98 | 0 | passgen_markov_node_check(data, level - 1); |
99 | 0 | } |
100 | 0 | } |
101 | 0 | } |
102 | 0 | } |
103 | | |
104 | 0 | void passgen_markov_check(passgen_markov *markov) { |
105 | 0 | passgen_markov_node_check(markov->root, markov->level); |
106 | 0 | } |
107 | | |
108 | | // Initialize new markov chain with a given level |
109 | 13 | void passgen_markov_init(passgen_markov *markov, uint8_t level) { |
110 | 13 | passgen_assert(level); |
111 | 13 | memset(markov, 0, sizeof(*markov)); |
112 | 13 | markov->level = level; |
113 | 13 | markov->root = passgen_markov_node_new(0); |
114 | 13 | } |
115 | | |
116 | 198 | passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) { |
117 | 198 | size_t size_index = 0; |
118 | 988 | while(markov_sizes[size_index] < leaf->capacity) { |
119 | 790 | size_index++; |
120 | 790 | } |
121 | 198 | |
122 | 198 | passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1); |
123 | 198 | |
124 | 198 | // re-insert old elements |
125 | 45.1M | for(size_t i = 0; i < leaf->capacity; i++45.1M ) { |
126 | 45.1M | if(passgen_markov_leaf_count_raw(leaf, i)) { |
127 | 193k | uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i); |
128 | 193k | uint32_t count = passgen_markov_leaf_count_raw(leaf, i); |
129 | 193k | new = passgen_markov_leaf_insert(new, codepoint, count); |
130 | 193k | } |
131 | 45.1M | } |
132 | 198 | |
133 | 198 | free(leaf); |
134 | 198 | |
135 | 198 | return new; |
136 | 198 | } |
137 | | |
138 | | passgen_markov_leaf *passgen_markov_leaf_insert( |
139 | | passgen_markov_leaf *leaf, |
140 | | uint32_t codepoint, |
141 | 315k | size_t weight) { |
142 | 315k | // don't do anything if asked to insert a codepoint with an empty weight. |
143 | 315k | if(weight == 0) { |
144 | 0 | return leaf; |
145 | 0 | } |
146 | 315k | |
147 | 316k | while(315k passgen_markov_leaf_count_raw(leaf, codepoint) && |
148 | 316k | passgen_markov_leaf_codepoint_raw871 (leaf, codepoint) != codepoint871 ) { |
149 | 198 | leaf = passgen_markov_leaf_realloc(leaf); |
150 | 198 | } |
151 | 315k | |
152 | 315k | // set codepoint |
153 | 315k | passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint; |
154 | 315k | passgen_markov_leaf_count_raw(leaf, codepoint) += weight; |
155 | 315k | leaf->total_count += weight; |
156 | 315k | |
157 | 315k | return leaf; |
158 | 315k | } |
159 | | |
160 | 588 | passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) { |
161 | 588 | size_t size_index = 0; |
162 | 3.13k | while(markov_sizes[size_index] < node->capacity) { |
163 | 2.54k | size_index++; |
164 | 2.54k | } |
165 | 588 | |
166 | 588 | passgen_markov_node *new = passgen_markov_node_new(size_index + 1); |
167 | 588 | |
168 | 588 | // re-insert old elements |
169 | 3.99M | for(size_t i = 0; i < node->capacity; i++3.99M ) { |
170 | 3.99M | if(passgen_markov_node_child(node, i).node) { |
171 | 22.0k | uint32_t codepoint = passgen_markov_node_codepoint(node, i); |
172 | 22.0k | new = passgen_markov_node_insert(new, codepoint); |
173 | 22.0k | assert(!passgen_markov_node_child(new, codepoint).node); |
174 | 22.0k | passgen_markov_node_child(new, codepoint).node = |
175 | 22.0k | passgen_markov_node_child(node, i).node; |
176 | 22.0k | } |
177 | 3.99M | } |
178 | 588 | |
179 | 588 | free(node); |
180 | 588 | |
181 | 588 | return new; |
182 | 588 | } |
183 | | |
184 | | passgen_markov_node * |
185 | 83.9k | passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) { |
186 | 84.5k | while(passgen_markov_node_child(node, codepoint).node && |
187 | 84.5k | passgen_markov_node_codepoint22.0k (node, codepoint) != codepoint22.0k ) { |
188 | 588 | node = passgen_markov_node_realloc(node); |
189 | 588 | } |
190 | 83.9k | |
191 | 83.9k | // set codepoint |
192 | 83.9k | passgen_markov_node_codepoint(node, codepoint) = codepoint; |
193 | 83.9k | |
194 | 83.9k | return node; |
195 | 83.9k | } |
196 | | |
197 | | passgen_markov_node *passgen_markov_node_insert_word( |
198 | | passgen_markov_node *node, |
199 | | const uint32_t *sequence, |
200 | | size_t length, |
201 | 60.8k | size_t weight) { |
202 | 60.8k | uint32_t codepoint = sequence[0]; |
203 | 60.8k | node = passgen_markov_node_insert(node, codepoint); |
204 | 60.8k | |
205 | 60.8k | if(length == 2) { |
206 | 11.2k | passgen_markov_leaf *child = |
207 | 11.2k | passgen_markov_node_child(node, codepoint).leaf; |
208 | 11.2k | |
209 | 11.2k | if(!child) { |
210 | 9.17k | child = passgen_markov_leaf_new(0); |
211 | 9.17k | } |
212 | 11.2k | |
213 | 11.2k | child = passgen_markov_leaf_insert(child, sequence[1], weight); |
214 | 11.2k | |
215 | 11.2k | passgen_markov_node_child(node, codepoint).leaf = child; |
216 | 49.6k | } else { |
217 | 49.6k | // retrieve or create child node |
218 | 49.6k | passgen_markov_node *child = |
219 | 49.6k | passgen_markov_node_child(node, codepoint).node; |
220 | 49.6k | if(!child) { |
221 | 30.2k | child = passgen_markov_node_new(0); |
222 | 30.2k | } |
223 | 49.6k | |
224 | 49.6k | // insert word into child |
225 | 49.6k | child = passgen_markov_node_insert_word( |
226 | 49.6k | child, |
227 | 49.6k | &sequence[1], |
228 | 49.6k | length - 1, |
229 | 49.6k | weight); |
230 | 49.6k | |
231 | 49.6k | // save child |
232 | 49.6k | passgen_markov_node_child(node, codepoint).node = child; |
233 | 49.6k | } |
234 | 60.8k | |
235 | 60.8k | return node; |
236 | 60.8k | } |
237 | | |
238 | | void passgen_markov_insert( |
239 | | passgen_markov *markov, |
240 | | const uint32_t *sequence, |
241 | 11.2k | size_t weight) { |
242 | 11.2k | markov->root = passgen_markov_node_insert_word( |
243 | 11.2k | markov->root, |
244 | 11.2k | sequence, |
245 | 11.2k | markov->level + 1, |
246 | 11.2k | weight); |
247 | 11.2k | markov->count += weight; |
248 | 11.2k | } |
249 | | |
250 | | // Add a word to a markov chain |
251 | | void passgen_markov_add( |
252 | | passgen_markov *markov, |
253 | | const uint32_t *word, |
254 | | size_t word_len, |
255 | 1.39k | size_t weight) { |
256 | 1.39k | // insert start sequences |
257 | 1.39k | size_t sequence_len = 2 * markov->level + 1; |
258 | 1.39k | uint32_t sequence[sequence_len]; |
259 | 1.39k | memset(sequence, 0, sizeof(sequence[0]) * sequence_len); |
260 | 1.39k | memcpy( |
261 | 1.39k | &sequence[markov->level], |
262 | 1.39k | word, |
263 | 1.39k | sizeof(uint32_t) * MIN(markov->level, word_len)); |
264 | 6.97k | for(size_t i = 0; i < MIN(markov->level, word_len); i++5.58k ) { |
265 | 5.58k | passgen_markov_insert(markov, &sequence[i], weight); |
266 | 5.58k | } |
267 | 5.62k | for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); i++4.23k ) { |
268 | 4.23k | passgen_markov_insert(markov, &word[i], weight); |
269 | 4.23k | } |
270 | 1.39k | memset(sequence, 0, sizeof(sequence[0]) * sequence_len); |
271 | 1.39k | memcpy( |
272 | 1.39k | &sequence[SATURATING_SUB(markov->level, word_len)], |
273 | 1.39k | &word[SATURATING_SUB(word_len, markov->level)], |
274 | 1.39k | sizeof(uint32_t) * MIN(markov->level, word_len)); |
275 | 1.39k | passgen_markov_insert(markov, &sequence[0], weight); |
276 | 1.39k | } |
277 | | |
278 | | // Free a markov chain |
279 | 13 | void passgen_markov_free(passgen_markov *markov) { |
280 | 13 | passgen_markov_node_free(markov->root, markov->level); |
281 | 13 | } |
282 | | |
283 | | uint32_t passgen_markov_generate( |
284 | | passgen_markov *markov, |
285 | | const uint32_t *current, |
286 | | passgen_random *random, |
287 | 0 | double *entropy) { |
288 | 0 | passgen_markov_node *node = markov->root; |
289 | 0 |
|
290 | 0 | for(size_t i = 0; i < markov->level; i++) { |
291 | 0 | node = |
292 | 0 | (passgen_markov_node *) passgen_markov_node_child(node, current[i]) |
293 | 0 | .leaf; |
294 | 0 | assert(node); |
295 | 0 | } |
296 | 0 |
|
297 | 0 | passgen_markov_leaf *leaf = (passgen_markov_leaf *) node; |
298 | 0 |
|
299 | 0 | // record total choices. |
300 | 0 | if(entropy) { |
301 | 0 | *entropy *= (double) leaf->total_count; |
302 | 0 | } |
303 | 0 |
|
304 | 0 | size_t choice = passgen_random_u64_max(random, leaf->total_count); |
305 | 0 | for(size_t i = 0; i < leaf->capacity; i++) { |
306 | 0 | uint32_t count = passgen_markov_leaf_count_raw(leaf, i); |
307 | 0 | if(count) { |
308 | 0 | if(choice < count) { |
309 | 0 | // record bucket (reduces total choices). |
310 | 0 | if(entropy) { |
311 | 0 | *entropy /= (double) count; |
312 | 0 | } |
313 | 0 |
|
314 | 0 | return passgen_markov_leaf_codepoint_raw(leaf, i); |
315 | 0 | } else { |
316 | 0 | choice -= count; |
317 | 0 | } |
318 | 0 | } |
319 | 0 | } |
320 | 0 |
|
321 | 0 | assert(false); |
322 | 0 |
|
323 | 0 | return 0; |
324 | 0 | } |
325 | | |
326 | | uint32_t |
327 | 221k | passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) { |
328 | 221k | if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) { |
329 | 110k | return passgen_markov_leaf_count_raw(leaf, codepoint); |
330 | 110k | } else { |
331 | 110k | return 0; |
332 | 110k | } |
333 | 221k | } |