Line data Source code
1 : #include "passgen/markov.h"
2 : #include "passgen/assert.h"
3 : #include <assert.h>
4 : #include <stdlib.h>
5 : #include <string.h>
6 : #define MIN(a, b) ((a) < (b) ? (a) : (b))
7 : #define SATURATING_SUB(a, b) ((a) -MIN(a, b))
8 :
9 : #define MARKOV_LEAF_INITIAL 4
10 : #define MARKOV_LEAF_MULTIPLIER 2
11 :
12 : #define passgen_markov_leaf_count_raw(leaf, codepoint) \
13 : leaf->entries[leaf->capacity].count[codepoint % leaf->capacity]
14 :
15 : #define passgen_markov_leaf_codepoint_raw(leaf, index) \
16 : leaf->entries[0].codepoint[index % leaf->capacity]
17 :
18 : /// Sizes for the leaf and node hash maps. These are all prime numbers, with the
19 : /// property that every number is greater than double the previous one.
20 : /// Generated with Ruby with:
21 : ///
22 : /// ```ruby
23 : /// require 'prime'
24 : /// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l}
25 : /// ```
26 : static const size_t markov_sizes[] = {
27 : 3, 7, 17, 37, 79, 163, 331,
28 : 673, 1361, 2729, 5471, 10949, 21911, 43853,
29 : 87719, 175447, 350899, 701819, 1403641, 2807303, 5614657,
30 : 11229331, 22458671, 44917381, 0};
31 :
32 30827 : size_t passgen_markov_node_size(size_t capacity) {
33 : // we use capacity + 1 there to make sure it's a round number, such that the
34 : // child pointers are not misaligned.
35 30827 : return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) +
36 : capacity * sizeof(passgen_markov_node *);
37 : }
38 :
39 9380 : size_t passgen_markov_leaf_size(size_t capacity) {
40 9380 : return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) +
41 : capacity * sizeof(uint32_t);
42 : }
43 :
44 30827 : passgen_markov_node *passgen_markov_node_new(size_t size_index) {
45 30827 : size_t capacity = markov_sizes[size_index];
46 30827 : passgen_assert(capacity);
47 30827 : size_t size = passgen_markov_node_size(capacity);
48 30827 : passgen_markov_node *node = calloc(1, size);
49 30827 : memset(node, 0, size);
50 30827 : node->capacity = capacity;
51 30827 : return node;
52 : }
53 :
54 9380 : passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) {
55 9380 : size_t capacity = markov_sizes[size_index];
56 9380 : if(capacity == 0) {
57 0 : return NULL;
58 : }
59 9380 : size_t size = passgen_markov_leaf_size(capacity);
60 9380 : passgen_markov_leaf *leaf = calloc(1, size);
61 9380 : memset(leaf, 0, size);
62 9380 : leaf->capacity = capacity;
63 9380 : leaf->total_count = 0;
64 9380 : return leaf;
65 : }
66 :
67 9180 : void passgen_markov_leaf_free(passgen_markov_leaf *leaf) {
68 9180 : free(leaf);
69 9180 : }
70 :
71 30237 : void passgen_markov_node_free(passgen_markov_node *node, size_t level) {
72 4117450 : for(size_t i = 0; i < node->capacity; i++) {
73 4087213 : void *child = passgen_markov_node_child(node, i).leaf;
74 4087213 : if(child) {
75 39400 : if(level < 2) {
76 9177 : passgen_markov_leaf_free(child);
77 : } else {
78 30223 : passgen_markov_node_free(child, level - 1);
79 : }
80 : }
81 : }
82 :
83 30237 : free(node);
84 30237 : }
85 :
86 0 : void passgen_markov_node_check(passgen_markov_node *node, size_t level) {
87 0 : assert(node->capacity);
88 0 : assert(node->capacity % 2);
89 0 : for(size_t i = 0; i < node->capacity; i++) {
90 0 : void *data = passgen_markov_node_child(node, i).node;
91 0 : uint32_t codepoint = passgen_markov_node_codepoint(node, i);
92 0 : if(codepoint) {
93 0 : assert(data);
94 : }
95 0 : if(data) {
96 0 : assert((codepoint % node->capacity) == i);
97 0 : if(level > 2) {
98 0 : passgen_markov_node_check(data, level - 1);
99 : }
100 : }
101 : }
102 0 : }
103 :
104 0 : void passgen_markov_check(passgen_markov *markov) {
105 0 : passgen_markov_node_check(markov->root, markov->level);
106 0 : }
107 :
108 : // Initialize new markov chain with a given level
109 13 : void passgen_markov_init(passgen_markov *markov, uint8_t level) {
110 13 : passgen_assert(level);
111 13 : memset(markov, 0, sizeof(*markov));
112 13 : markov->level = level;
113 13 : markov->root = passgen_markov_node_new(0);
114 13 : }
115 :
116 198 : passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) {
117 198 : size_t size_index = 0;
118 988 : while(markov_sizes[size_index] < leaf->capacity) {
119 790 : size_index++;
120 : }
121 :
122 198 : passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1);
123 :
124 : // re-insert old elements
125 45166668 : for(size_t i = 0; i < leaf->capacity; i++) {
126 45166470 : if(passgen_markov_leaf_count_raw(leaf, i)) {
127 193682 : uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i);
128 193682 : uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
129 193682 : new = passgen_markov_leaf_insert(new, codepoint, count);
130 : }
131 : }
132 :
133 198 : free(leaf);
134 :
135 198 : return new;
136 : }
137 :
138 315891 : passgen_markov_leaf *passgen_markov_leaf_insert(
139 : passgen_markov_leaf *leaf,
140 : uint32_t codepoint,
141 : size_t weight) {
142 : // don't do anything if asked to insert a codepoint with an empty weight.
143 315891 : if(weight == 0) {
144 0 : return leaf;
145 : }
146 :
147 316089 : while(passgen_markov_leaf_count_raw(leaf, codepoint) &&
148 871 : passgen_markov_leaf_codepoint_raw(leaf, codepoint) != codepoint) {
149 198 : leaf = passgen_markov_leaf_realloc(leaf);
150 : }
151 :
152 : // set codepoint
153 315891 : passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint;
154 315891 : passgen_markov_leaf_count_raw(leaf, codepoint) += weight;
155 315891 : leaf->total_count += weight;
156 :
157 315891 : return leaf;
158 : }
159 :
160 588 : passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) {
161 588 : size_t size_index = 0;
162 3134 : while(markov_sizes[size_index] < node->capacity) {
163 2546 : size_index++;
164 : }
165 :
166 588 : passgen_markov_node *new = passgen_markov_node_new(size_index + 1);
167 :
168 : // re-insert old elements
169 3994742 : for(size_t i = 0; i < node->capacity; i++) {
170 3994154 : if(passgen_markov_node_child(node, i).node) {
171 22089 : uint32_t codepoint = passgen_markov_node_codepoint(node, i);
172 22089 : new = passgen_markov_node_insert(new, codepoint);
173 22089 : assert(!passgen_markov_node_child(new, codepoint).node);
174 22089 : passgen_markov_node_child(new, codepoint).node =
175 22089 : passgen_markov_node_child(node, i).node;
176 : }
177 : }
178 :
179 588 : free(node);
180 :
181 588 : return new;
182 : }
183 :
184 : passgen_markov_node *
185 83981 : passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) {
186 84569 : while(passgen_markov_node_child(node, codepoint).node &&
187 22080 : passgen_markov_node_codepoint(node, codepoint) != codepoint) {
188 588 : node = passgen_markov_node_realloc(node);
189 : }
190 :
191 : // set codepoint
192 83981 : passgen_markov_node_codepoint(node, codepoint) = codepoint;
193 :
194 83981 : return node;
195 : }
196 :
197 60892 : passgen_markov_node *passgen_markov_node_insert_word(
198 : passgen_markov_node *node,
199 : const uint32_t *sequence,
200 : size_t length,
201 : size_t weight) {
202 60892 : uint32_t codepoint = sequence[0];
203 60892 : node = passgen_markov_node_insert(node, codepoint);
204 :
205 60892 : if(length == 2) {
206 11208 : passgen_markov_leaf *child =
207 11208 : passgen_markov_node_child(node, codepoint).leaf;
208 :
209 11208 : if(!child) {
210 9177 : child = passgen_markov_leaf_new(0);
211 : }
212 :
213 11208 : child = passgen_markov_leaf_insert(child, sequence[1], weight);
214 :
215 11208 : passgen_markov_node_child(node, codepoint).leaf = child;
216 : } else {
217 : // retrieve or create child node
218 49684 : passgen_markov_node *child =
219 49684 : passgen_markov_node_child(node, codepoint).node;
220 49684 : if(!child) {
221 30223 : child = passgen_markov_node_new(0);
222 : }
223 :
224 : // insert word into child
225 49684 : child = passgen_markov_node_insert_word(
226 : child,
227 : &sequence[1],
228 : length - 1,
229 : weight);
230 :
231 : // save child
232 49684 : passgen_markov_node_child(node, codepoint).node = child;
233 : }
234 :
235 60892 : return node;
236 : }
237 :
238 11207 : void passgen_markov_insert(
239 : passgen_markov *markov,
240 : const uint32_t *sequence,
241 : size_t weight) {
242 22414 : markov->root = passgen_markov_node_insert_word(
243 : markov->root,
244 : sequence,
245 11207 : markov->level + 1,
246 : weight);
247 11207 : markov->count += weight;
248 11207 : }
249 :
250 : // Add a word to a markov chain
251 1391 : void passgen_markov_add(
252 : passgen_markov *markov,
253 : const uint32_t *word,
254 : size_t word_len,
255 1391 : size_t weight) {
256 : // insert start sequences
257 1391 : size_t sequence_len = 2 * markov->level + 1;
258 1391 : uint32_t sequence[sequence_len];
259 1391 : memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
260 1391 : memcpy(
261 1391 : &sequence[markov->level],
262 : word,
263 1391 : sizeof(uint32_t) * MIN(markov->level, word_len));
264 6977 : for(size_t i = 0; i < MIN(markov->level, word_len); i++) {
265 5586 : passgen_markov_insert(markov, &sequence[i], weight);
266 : }
267 5621 : for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); i++) {
268 4230 : passgen_markov_insert(markov, &word[i], weight);
269 : }
270 1391 : memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
271 1391 : memcpy(
272 1391 : &sequence[SATURATING_SUB(markov->level, word_len)],
273 1391 : &word[SATURATING_SUB(word_len, markov->level)],
274 1391 : sizeof(uint32_t) * MIN(markov->level, word_len));
275 1391 : passgen_markov_insert(markov, &sequence[0], weight);
276 1391 : }
277 :
278 : // Free a markov chain
279 13 : void passgen_markov_free(passgen_markov *markov) {
280 13 : passgen_markov_node_free(markov->root, markov->level);
281 13 : }
282 :
283 0 : uint32_t passgen_markov_generate(
284 : passgen_markov *markov,
285 : const uint32_t *current,
286 : passgen_random *random,
287 : double *entropy) {
288 0 : passgen_markov_node *node = markov->root;
289 :
290 0 : for(size_t i = 0; i < markov->level; i++) {
291 0 : node =
292 0 : (passgen_markov_node *) passgen_markov_node_child(node, current[i])
293 : .leaf;
294 0 : assert(node);
295 : }
296 :
297 0 : passgen_markov_leaf *leaf = (passgen_markov_leaf *) node;
298 :
299 : // record total choices.
300 0 : if(entropy) {
301 0 : *entropy *= (double) leaf->total_count;
302 : }
303 :
304 0 : size_t choice = passgen_random_u64_max(random, leaf->total_count);
305 0 : for(size_t i = 0; i < leaf->capacity; i++) {
306 0 : uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
307 0 : if(count) {
308 0 : if(choice < count) {
309 : // record bucket (reduces total choices).
310 0 : if(entropy) {
311 0 : *entropy /= (double) count;
312 : }
313 :
314 0 : return passgen_markov_leaf_codepoint_raw(leaf, i);
315 : } else {
316 0 : choice -= count;
317 : }
318 : }
319 : }
320 :
321 0 : assert(false);
322 :
323 : return 0;
324 : }
325 :
326 : uint32_t
327 221000 : passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) {
328 221000 : if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) {
329 110021 : return passgen_markov_leaf_count_raw(leaf, codepoint);
330 : } else {
331 110979 : return 0;
332 : }
333 : }
|