/builds/passgen/passgen-c/src/markov.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "passgen/markov.h" |
2 | | #include "passgen/assert.h" |
3 | | #include "passgen/config.h" |
4 | | #include <assert.h> |
5 | | #include <stdlib.h> |
6 | | #include <string.h> |
7 | 48.7k | #define MIN(a, b) ((a) < (b) ? (a)22.4k : (b)26.3k ) |
8 | 26.7k | #define SATURATING_SUB(a, b) ((a) -MIN(a, b)) |
9 | | |
10 | | #define MARKOV_LEAF_INITIAL 4 |
11 | | #define MARKOV_LEAF_MULTIPLIER 2 |
12 | | |
13 | | #define passgen_markov_leaf_count_raw(leaf, codepoint) \ |
14 | 47.1M | leaf->entries[leaf->capacity].count[codepoint % leaf->capacity] |
15 | | |
16 | | #define passgen_markov_leaf_codepoint_raw(leaf, index) \ |
17 | 730k | leaf->entries[0].codepoint[index % leaf->capacity] |
18 | | |
19 | | /// Sizes for the leaf and node hash maps. These are all prime numbers, with the |
20 | | /// property that every number is greater than double the previous one. |
21 | | /// Generated with Ruby with: |
22 | | /// |
23 | | /// ```ruby |
24 | | /// require 'prime' |
25 | | /// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l} |
26 | | /// ``` |
27 | | static const size_t markov_sizes[] = { |
28 | | 3, 7, 17, 37, 79, 163, 331, |
29 | | 673, 1361, 2729, 5471, 10949, 21911, 43853, |
30 | | 87719, 175447, 350899, 701819, 1403641, 2807303, 5614657, |
31 | | 11229331, 22458671, 44917381, 0}; |
32 | | |
33 | 81.7k | size_t passgen_markov_node_size(size_t capacity) { |
34 | 81.7k | // we use capacity + 1 there to make sure it's a round number, such that the |
35 | 81.7k | // child pointers are not misaligned. |
36 | 81.7k | return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) + |
37 | 81.7k | capacity * sizeof(passgen_markov_node *); |
38 | 81.7k | } |
39 | | |
40 | 29.8k | size_t passgen_markov_leaf_size(size_t capacity) { |
41 | 29.8k | return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) + |
42 | 29.8k | capacity * sizeof(uint32_t); |
43 | 29.8k | } |
44 | | |
45 | 81.7k | passgen_markov_node *passgen_markov_node_new(size_t size_index) { |
46 | 81.7k | size_t capacity = markov_sizes[size_index]; |
47 | 81.7k | passgen_assert(capacity); |
48 | 81.7k | size_t size = passgen_markov_node_size(capacity); |
49 | 81.7k | passgen_markov_node *node = calloc(1, size); |
50 | 81.7k | memset(node, 0, size); |
51 | 81.7k | node->capacity = capacity; |
52 | 81.7k | return node; |
53 | 81.7k | } |
54 | | |
55 | 29.8k | passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) { |
56 | 29.8k | size_t capacity = markov_sizes[size_index]; |
57 | 29.8k | if(capacity == 0) { |
58 | 0 | return NULL; |
59 | 0 | } |
60 | 29.8k | size_t size = passgen_markov_leaf_size(capacity); |
61 | 29.8k | passgen_markov_leaf *leaf = calloc(1, size); |
62 | 29.8k | memset(leaf, 0, size); |
63 | 29.8k | leaf->capacity = capacity; |
64 | 29.8k | leaf->total_count = 0; |
65 | 29.8k | return leaf; |
66 | 29.8k | } |
67 | | |
68 | 29.5k | void passgen_markov_leaf_free(passgen_markov_leaf *leaf) { |
69 | 29.5k | PASSGEN_CLEAR(leaf); |
70 | 29.5k | free(leaf); |
71 | 29.5k | } |
72 | | |
73 | 80.8k | void passgen_markov_node_free(passgen_markov_node *node, size_t level) { |
74 | 7.38M | for(size_t i = 0; i < node->capacity; i++7.30M ) { |
75 | 7.30M | void *child = passgen_markov_node_child(node, i).leaf; |
76 | 7.30M | if(child) { |
77 | 110k | if(level < 2) { |
78 | 29.5k | passgen_markov_leaf_free(child); |
79 | 80.8k | } else { |
80 | 80.8k | passgen_markov_node_free(child, level - 1); |
81 | 80.8k | } |
82 | 110k | } |
83 | 7.30M | } |
84 | 80.8k | |
85 | 80.8k | PASSGEN_CLEAR(node); |
86 | 80.8k | free(node); |
87 | 80.8k | } |
88 | | |
89 | 0 | void passgen_markov_node_check(passgen_markov_node *node, size_t level) { |
90 | 0 | assert(node->capacity); |
91 | 0 | assert(node->capacity % 2); |
92 | 0 | for(size_t i = 0; i < node->capacity; i++) { |
93 | 0 | void *data = passgen_markov_node_child(node, i).node; |
94 | 0 | uint32_t codepoint = passgen_markov_node_codepoint(node, i); |
95 | 0 | if(codepoint) { |
96 | 0 | assert(data); |
97 | 0 | } |
98 | 0 | if(data) { |
99 | 0 | assert((codepoint % node->capacity) == i); |
100 | 0 | if(level > 2) { |
101 | 0 | passgen_markov_node_check(data, level - 1); |
102 | 0 | } |
103 | 0 | } |
104 | 0 | } |
105 | 0 | } |
106 | | |
107 | 0 | void passgen_markov_check(passgen_markov *markov) { |
108 | 0 | passgen_markov_node_check(markov->root, markov->level); |
109 | 0 | } |
110 | | |
111 | | // Initialize new markov chain with a given level |
112 | 14 | void passgen_markov_init(passgen_markov *markov, uint8_t level) { |
113 | 14 | passgen_assert(level); |
114 | 14 | memset(markov, 0, sizeof(*markov)); |
115 | 14 | markov->level = level; |
116 | 14 | markov->root = passgen_markov_node_new(0); |
117 | 14 | } |
118 | | |
119 | 225 | passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) { |
120 | 225 | size_t size_index = 0; |
121 | 1.14k | while(markov_sizes[size_index] < leaf->capacity) { |
122 | 924 | size_index++; |
123 | 924 | } |
124 | 225 | |
125 | 225 | passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1); |
126 | 225 | |
127 | 225 | // re-insert old elements |
128 | 1.37M | for(size_t i = 0; i < leaf->capacity; i++1.37M ) { |
129 | 1.37M | if(passgen_markov_leaf_count_raw(leaf, i)) { |
130 | 184k | uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i); |
131 | 184k | uint32_t count = passgen_markov_leaf_count_raw(leaf, i); |
132 | 184k | new = passgen_markov_leaf_insert(new, codepoint, count); |
133 | 184k | } |
134 | 1.37M | } |
135 | 225 | |
136 | 225 | free(leaf); |
137 | 225 | |
138 | 225 | return new; |
139 | 225 | } |
140 | | |
141 | | passgen_markov_leaf *passgen_markov_leaf_insert( |
142 | | passgen_markov_leaf *leaf, |
143 | | uint32_t codepoint, |
144 | 319k | size_t weight) { |
145 | 319k | // don't do anything if asked to insert a codepoint with an empty weight. |
146 | 319k | if(weight == 0) { |
147 | 0 | return leaf; |
148 | 0 | } |
149 | 319k | |
150 | 319k | while(319k passgen_markov_leaf_count_raw(leaf, codepoint) && |
151 | 319k | passgen_markov_leaf_codepoint_raw891 (leaf, codepoint) != codepoint891 ) { |
152 | 225 | leaf = passgen_markov_leaf_realloc(leaf); |
153 | 225 | } |
154 | 319k | |
155 | 319k | // set codepoint |
156 | 319k | passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint; |
157 | 319k | passgen_markov_leaf_count_raw(leaf, codepoint) += weight; |
158 | 319k | leaf->total_count += weight; |
159 | 319k | |
160 | 319k | return leaf; |
161 | 319k | } |
162 | | |
163 | 913 | passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) { |
164 | 913 | size_t size_index = 0; |
165 | 4.69k | while(markov_sizes[size_index] < node->capacity) { |
166 | 3.78k | size_index++; |
167 | 3.78k | } |
168 | 913 | |
169 | 913 | passgen_markov_node *new = passgen_markov_node_new(size_index + 1); |
170 | 913 | |
171 | 913 | // re-insert old elements |
172 | 7.06M | for(size_t i = 0; i < node->capacity; i++7.05M ) { |
173 | 7.05M | if(passgen_markov_node_child(node, i).node) { |
174 | 37.7k | uint32_t codepoint = passgen_markov_node_codepoint(node, i); |
175 | 37.7k | new = passgen_markov_node_insert(new, codepoint); |
176 | 37.7k | assert(!passgen_markov_node_child(new, codepoint).node); |
177 | 37.7k | passgen_markov_node_child(new, codepoint).node = |
178 | 37.7k | passgen_markov_node_child(node, i).node; |
179 | 37.7k | } |
180 | 7.05M | } |
181 | 913 | |
182 | 913 | free(node); |
183 | 913 | |
184 | 913 | return new; |
185 | 913 | } |
186 | | |
187 | | passgen_markov_node * |
188 | 193k | passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) { |
189 | 194k | while(passgen_markov_node_child(node, codepoint).node && |
190 | 194k | passgen_markov_node_codepoint45.0k (node, codepoint) != codepoint45.0k ) { |
191 | 913 | node = passgen_markov_node_realloc(node); |
192 | 913 | } |
193 | 193k | |
194 | 193k | // set codepoint |
195 | 193k | passgen_markov_node_codepoint(node, codepoint) = codepoint; |
196 | 193k | |
197 | 193k | return node; |
198 | 193k | } |
199 | | |
200 | | passgen_markov_node *passgen_markov_node_insert_word( |
201 | | passgen_markov_node *node, |
202 | | const uint32_t *sequence, |
203 | | size_t length, |
204 | 154k | size_t weight) { |
205 | 154k | uint32_t codepoint = sequence[0]; |
206 | 154k | node = passgen_markov_node_insert(node, codepoint); |
207 | 154k | |
208 | 154k | if(length == 2) { |
209 | 33.3k | passgen_markov_leaf *child = |
210 | 33.3k | passgen_markov_node_child(node, codepoint).leaf; |
211 | 33.3k | |
212 | 33.3k | if(!child) { |
213 | 29.5k | child = passgen_markov_leaf_new(0); |
214 | 29.5k | } |
215 | 33.3k | |
216 | 33.3k | child = passgen_markov_leaf_insert(child, sequence[1], weight); |
217 | 33.3k | |
218 | 33.3k | passgen_markov_node_child(node, codepoint).leaf = child; |
219 | 121k | } else { |
220 | 121k | // retrieve or create child node |
221 | 121k | passgen_markov_node *child = |
222 | 121k | passgen_markov_node_child(node, codepoint).node; |
223 | 121k | if(!child) { |
224 | 80.8k | child = passgen_markov_node_new(0); |
225 | 80.8k | } |
226 | 121k | |
227 | 121k | // insert word into child |
228 | 121k | child = passgen_markov_node_insert_word( |
229 | 121k | child, |
230 | 121k | &sequence[1], |
231 | 121k | length - 1, |
232 | 121k | weight); |
233 | 121k | |
234 | 121k | // save child |
235 | 121k | passgen_markov_node_child(node, codepoint).node = child; |
236 | 121k | } |
237 | 154k | |
238 | 154k | return node; |
239 | 154k | } |
240 | | |
241 | | void passgen_markov_insert( |
242 | | passgen_markov *markov, |
243 | | const uint32_t *sequence, |
244 | 33.3k | size_t weight) { |
245 | 33.3k | markov->root = passgen_markov_node_insert_word( |
246 | 33.3k | markov->root, |
247 | 33.3k | sequence, |
248 | 33.3k | markov->level + 1, |
249 | 33.3k | weight); |
250 | 33.3k | markov->count += weight; |
251 | 33.3k | } |
252 | | |
253 | | // Add a word to a markov chain |
254 | | void passgen_markov_add( |
255 | | passgen_markov *markov, |
256 | | const uint32_t *word, |
257 | | size_t word_len, |
258 | 3.09k | size_t weight) { |
259 | 3.09k | // insert start sequences |
260 | 3.09k | size_t sequence_len = 2 * markov->level + 1; |
261 | 3.09k | uint32_t sequence[sequence_len]; |
262 | 3.09k | memset(sequence, 0, sizeof(sequence[0]) * sequence_len); |
263 | 3.09k | memcpy( |
264 | 3.09k | &sequence[markov->level], |
265 | 3.09k | word, |
266 | 3.09k | sizeof(uint32_t) * MIN(markov->level, word_len)); |
267 | 15.8k | for(size_t i = 0; i < MIN(markov->level, word_len); i++12.7k ) { |
268 | 12.7k | passgen_markov_insert(markov, &sequence[i], weight); |
269 | 12.7k | } |
270 | 20.5k | for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); i++17.4k ) { |
271 | 17.4k | passgen_markov_insert(markov, &word[i], weight); |
272 | 17.4k | } |
273 | 3.09k | memset(sequence, 0, sizeof(sequence[0]) * sequence_len); |
274 | 3.09k | memcpy( |
275 | 3.09k | &sequence[SATURATING_SUB(markov->level, word_len)], |
276 | 3.09k | &word[SATURATING_SUB(word_len, markov->level)], |
277 | 3.09k | sizeof(uint32_t) * MIN(markov->level, word_len)); |
278 | 3.09k | passgen_markov_insert(markov, &sequence[0], weight); |
279 | 3.09k | } |
280 | | |
281 | | // Free a markov chain |
282 | 14 | void passgen_markov_free(passgen_markov *markov) { |
283 | 14 | passgen_markov_node_free(markov->root, markov->level); |
284 | 14 | PASSGEN_CLEAR(markov); |
285 | 14 | } |
286 | | |
287 | | uint32_t passgen_markov_generate( |
288 | | passgen_markov *markov, |
289 | | const uint32_t *current, |
290 | | passgen_random *random, |
291 | 13.0k | double *entropy) { |
292 | 13.0k | passgen_markov_node *node = markov->root; |
293 | 13.0k | |
294 | 52.0k | for(size_t i = 0; i < markov->level; i++39.0k ) { |
295 | 39.0k | node = |
296 | 39.0k | (passgen_markov_node *) passgen_markov_node_child(node, current[i]) |
297 | 39.0k | .leaf; |
298 | 39.0k | assert(node); |
299 | 39.0k | } |
300 | 13.0k | |
301 | 13.0k | passgen_markov_leaf *leaf = (passgen_markov_leaf *) node; |
302 | 13.0k | |
303 | 13.0k | // record total choices. |
304 | 13.0k | if(entropy) { |
305 | 0 | *entropy *= (double) leaf->total_count; |
306 | 0 | } |
307 | 13.0k | |
308 | 13.0k | size_t choice = passgen_random_u64_max(random, leaf->total_count); |
309 | 43.1M | for(size_t i = 0; i < leaf->capacity; i++43.1M ) { |
310 | 43.1M | uint32_t count = passgen_markov_leaf_count_raw(leaf, i); |
311 | 43.1M | if(count) { |
312 | 512k | if(choice < count) { |
313 | 13.0k | // record bucket (reduces total choices). |
314 | 13.0k | if(entropy) { |
315 | 0 | *entropy /= (double) count; |
316 | 0 | } |
317 | 13.0k | |
318 | 13.0k | return passgen_markov_leaf_codepoint_raw(leaf, i); |
319 | 499k | } else { |
320 | 499k | choice -= count; |
321 | 499k | } |
322 | 512k | } |
323 | 43.1M | } |
324 | 13.0k | |
325 | 13.0k | assert(false); |
326 | 0 |
|
327 | 0 | return 0; |
328 | 0 | } |
329 | | |
330 | | uint32_t |
331 | 212k | passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) { |
332 | 212k | if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) { |
333 | 101k | return passgen_markov_leaf_count_raw(leaf, codepoint); |
334 | 110k | } else { |
335 | 110k | return 0; |
336 | 110k | } |
337 | 212k | } |