Line data Source code
1 : #include "passgen/markov.h"
2 : #include "passgen/assert.h"
3 : #include "passgen/config.h"
4 : #include <assert.h>
5 : #include <stdlib.h>
6 : #include <string.h>
7 : #define MIN(a, b) ((a) < (b) ? (a) : (b))
8 : #define SATURATING_SUB(a, b) ((a) -MIN(a, b))
9 :
10 : #define MARKOV_LEAF_INITIAL 4
11 : #define MARKOV_LEAF_MULTIPLIER 2
12 :
13 : #define passgen_markov_leaf_count_raw(leaf, codepoint) \
14 : leaf->entries[leaf->capacity].count[codepoint % leaf->capacity]
15 :
16 : #define passgen_markov_leaf_codepoint_raw(leaf, index) \
17 : leaf->entries[0].codepoint[index % leaf->capacity]
18 :
19 : /// Sizes for the leaf and node hash maps. These are all prime numbers, with the
20 : /// property that every number is greater than double the previous one.
21 : /// Generated with Ruby with:
22 : ///
23 : /// ```ruby
24 : /// require 'prime'
25 : /// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l}
26 : /// ```
27 : static const size_t markov_sizes[] = {
28 : 3, 7, 17, 37, 79, 163, 331,
29 : 673, 1361, 2729, 5471, 10949, 21911, 43853,
30 : 87719, 175447, 350899, 701819, 1403641, 2807303, 5614657,
31 : 11229331, 22458671, 44917381, 0};
32 :
33 81735 : size_t passgen_markov_node_size(size_t capacity) {
34 : // we use capacity + 1 there to make sure it's a round number, such that the
35 : // child pointers are not misaligned.
36 81735 : return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) +
37 : capacity * sizeof(passgen_markov_node *);
38 : }
39 :
40 29801 : size_t passgen_markov_leaf_size(size_t capacity) {
41 29801 : return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) +
42 : capacity * sizeof(uint32_t);
43 : }
44 :
45 81735 : passgen_markov_node *passgen_markov_node_new(size_t size_index) {
46 81735 : size_t capacity = markov_sizes[size_index];
47 81735 : passgen_assert(capacity);
48 81735 : size_t size = passgen_markov_node_size(capacity);
49 81735 : passgen_markov_node *node = calloc(1, size);
50 81735 : memset(node, 0, size);
51 81735 : node->capacity = capacity;
52 81735 : return node;
53 : }
54 :
55 29801 : passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) {
56 29801 : size_t capacity = markov_sizes[size_index];
57 29801 : if(capacity == 0) {
58 0 : return NULL;
59 : }
60 29801 : size_t size = passgen_markov_leaf_size(capacity);
61 29801 : passgen_markov_leaf *leaf = calloc(1, size);
62 29801 : memset(leaf, 0, size);
63 29801 : leaf->capacity = capacity;
64 29801 : leaf->total_count = 0;
65 29801 : return leaf;
66 : }
67 :
68 29574 : void passgen_markov_leaf_free(passgen_markov_leaf *leaf) {
69 29574 : PASSGEN_CLEAR(leaf);
70 29574 : free(leaf);
71 29574 : }
72 :
73 80820 : void passgen_markov_node_free(passgen_markov_node *node, size_t level) {
74 7386804 : for(size_t i = 0; i < node->capacity; i++) {
75 7305984 : void *child = passgen_markov_node_child(node, i).leaf;
76 7305984 : if(child) {
77 110376 : if(level < 2) {
78 29571 : passgen_markov_leaf_free(child);
79 : } else {
80 80805 : passgen_markov_node_free(child, level - 1);
81 : }
82 : }
83 : }
84 :
85 80820 : PASSGEN_CLEAR(node);
86 80820 : free(node);
87 80820 : }
88 :
89 0 : void passgen_markov_node_check(passgen_markov_node *node, size_t level) {
90 0 : assert(node->capacity);
91 0 : assert(node->capacity % 2);
92 0 : for(size_t i = 0; i < node->capacity; i++) {
93 0 : void *data = passgen_markov_node_child(node, i).node;
94 0 : uint32_t codepoint = passgen_markov_node_codepoint(node, i);
95 0 : if(codepoint) {
96 0 : assert(data);
97 : }
98 0 : if(data) {
99 0 : assert((codepoint % node->capacity) == i);
100 0 : if(level > 2) {
101 0 : passgen_markov_node_check(data, level - 1);
102 : }
103 : }
104 : }
105 0 : }
106 :
107 0 : void passgen_markov_check(passgen_markov *markov) {
108 0 : passgen_markov_node_check(markov->root, markov->level);
109 0 : }
110 :
111 : // Initialize new markov chain with a given level
112 14 : void passgen_markov_init(passgen_markov *markov, uint8_t level) {
113 14 : passgen_assert(level);
114 14 : memset(markov, 0, sizeof(*markov));
115 14 : markov->level = level;
116 14 : markov->root = passgen_markov_node_new(0);
117 14 : }
118 :
119 225 : passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) {
120 225 : size_t size_index = 0;
121 1149 : while(markov_sizes[size_index] < leaf->capacity) {
122 924 : size_index++;
123 : }
124 :
125 225 : passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1);
126 :
127 : // re-insert old elements
128 1371790 : for(size_t i = 0; i < leaf->capacity; i++) {
129 1371565 : if(passgen_markov_leaf_count_raw(leaf, i)) {
130 184422 : uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i);
131 184422 : uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
132 184422 : new = passgen_markov_leaf_insert(new, codepoint, count);
133 : }
134 : }
135 :
136 225 : free(leaf);
137 :
138 225 : return new;
139 : }
140 :
141 319731 : passgen_markov_leaf *passgen_markov_leaf_insert(
142 : passgen_markov_leaf *leaf,
143 : uint32_t codepoint,
144 : size_t weight) {
145 : // don't do anything if asked to insert a codepoint with an empty weight.
146 319731 : if(weight == 0) {
147 0 : return leaf;
148 : }
149 :
150 319956 : while(passgen_markov_leaf_count_raw(leaf, codepoint) &&
151 891 : passgen_markov_leaf_codepoint_raw(leaf, codepoint) != codepoint) {
152 225 : leaf = passgen_markov_leaf_realloc(leaf);
153 : }
154 :
155 : // set codepoint
156 319731 : passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint;
157 319731 : passgen_markov_leaf_count_raw(leaf, codepoint) += weight;
158 319731 : leaf->total_count += weight;
159 :
160 319731 : return leaf;
161 : }
162 :
163 913 : passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) {
164 913 : size_t size_index = 0;
165 4697 : while(markov_sizes[size_index] < node->capacity) {
166 3784 : size_index++;
167 : }
168 :
169 913 : passgen_markov_node *new = passgen_markov_node_new(size_index + 1);
170 :
171 : // re-insert old elements
172 7060542 : for(size_t i = 0; i < node->capacity; i++) {
173 7059629 : if(passgen_markov_node_child(node, i).node) {
174 37725 : uint32_t codepoint = passgen_markov_node_codepoint(node, i);
175 37725 : new = passgen_markov_node_insert(new, codepoint);
176 37725 : assert(!passgen_markov_node_child(new, codepoint).node);
177 37725 : passgen_markov_node_child(new, codepoint).node =
178 37725 : passgen_markov_node_child(node, i).node;
179 : }
180 : }
181 :
182 913 : free(node);
183 :
184 913 : return new;
185 : }
186 :
187 : passgen_markov_node *
188 193217 : passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) {
189 194130 : while(passgen_markov_node_child(node, codepoint).node &&
190 45029 : passgen_markov_node_codepoint(node, codepoint) != codepoint) {
191 913 : node = passgen_markov_node_realloc(node);
192 : }
193 :
194 : // set codepoint
195 193217 : passgen_markov_node_codepoint(node, codepoint) = codepoint;
196 :
197 193217 : return node;
198 : }
199 :
200 154492 : passgen_markov_node *passgen_markov_node_insert_word(
201 : passgen_markov_node *node,
202 : const uint32_t *sequence,
203 : size_t length,
204 : size_t weight) {
205 154492 : uint32_t codepoint = sequence[0];
206 154492 : node = passgen_markov_node_insert(node, codepoint);
207 :
208 154492 : if(length == 2) {
209 33308 : passgen_markov_leaf *child =
210 33308 : passgen_markov_node_child(node, codepoint).leaf;
211 :
212 33308 : if(!child) {
213 29571 : child = passgen_markov_leaf_new(0);
214 : }
215 :
216 33308 : child = passgen_markov_leaf_insert(child, sequence[1], weight);
217 :
218 33308 : passgen_markov_node_child(node, codepoint).leaf = child;
219 : } else {
220 : // retrieve or create child node
221 121184 : passgen_markov_node *child =
222 121184 : passgen_markov_node_child(node, codepoint).node;
223 121184 : if(!child) {
224 80805 : child = passgen_markov_node_new(0);
225 : }
226 :
227 : // insert word into child
228 121184 : child = passgen_markov_node_insert_word(
229 : child,
230 : &sequence[1],
231 : length - 1,
232 : weight);
233 :
234 : // save child
235 121184 : passgen_markov_node_child(node, codepoint).node = child;
236 : }
237 :
238 154492 : return node;
239 : }
240 :
241 33307 : void passgen_markov_insert(
242 : passgen_markov *markov,
243 : const uint32_t *sequence,
244 : size_t weight) {
245 66614 : markov->root = passgen_markov_node_insert_word(
246 : markov->root,
247 : sequence,
248 33307 : markov->level + 1,
249 : weight);
250 33307 : markov->count += weight;
251 33307 : }
252 :
253 : // Add a word to a markov chain
254 3091 : void passgen_markov_add(
255 : passgen_markov *markov,
256 : const uint32_t *word,
257 : size_t word_len,
258 3091 : size_t weight) {
259 : // insert start sequences
260 3091 : size_t sequence_len = 2 * markov->level + 1;
261 3091 : uint32_t sequence[sequence_len];
262 3091 : memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
263 3091 : memcpy(
264 3091 : &sequence[markov->level],
265 : word,
266 3091 : sizeof(uint32_t) * MIN(markov->level, word_len));
267 15877 : for(size_t i = 0; i < MIN(markov->level, word_len); i++) {
268 12786 : passgen_markov_insert(markov, &sequence[i], weight);
269 : }
270 20521 : for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); i++) {
271 17430 : passgen_markov_insert(markov, &word[i], weight);
272 : }
273 3091 : memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
274 3091 : memcpy(
275 3091 : &sequence[SATURATING_SUB(markov->level, word_len)],
276 3091 : &word[SATURATING_SUB(word_len, markov->level)],
277 3091 : sizeof(uint32_t) * MIN(markov->level, word_len));
278 3091 : passgen_markov_insert(markov, &sequence[0], weight);
279 3091 : }
280 :
281 : // Free a markov chain
282 14 : void passgen_markov_free(passgen_markov *markov) {
283 14 : passgen_markov_node_free(markov->root, markov->level);
284 14 : PASSGEN_CLEAR(markov);
285 14 : }
286 :
287 13000 : uint32_t passgen_markov_generate(
288 : passgen_markov *markov,
289 : const uint32_t *current,
290 : passgen_random *random,
291 : double *entropy) {
292 13000 : passgen_markov_node *node = markov->root;
293 :
294 52000 : for(size_t i = 0; i < markov->level; i++) {
295 39000 : node =
296 39000 : (passgen_markov_node *) passgen_markov_node_child(node, current[i])
297 : .leaf;
298 39000 : assert(node);
299 : }
300 :
301 13000 : passgen_markov_leaf *leaf = (passgen_markov_leaf *) node;
302 :
303 : // record total choices.
304 13000 : if(entropy) {
305 0 : *entropy *= (double) leaf->total_count;
306 : }
307 :
308 13000 : size_t choice = passgen_random_u64_max(random, leaf->total_count);
309 43198589 : for(size_t i = 0; i < leaf->capacity; i++) {
310 43198589 : uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
311 43198589 : if(count) {
312 512513 : if(choice < count) {
313 : // record bucket (reduces total choices).
314 13000 : if(entropy) {
315 0 : *entropy /= (double) count;
316 : }
317 :
318 13000 : return passgen_markov_leaf_codepoint_raw(leaf, i);
319 : } else {
320 499513 : choice -= count;
321 : }
322 : }
323 : }
324 :
325 0 : assert(false);
326 :
327 : return 0;
328 : }
329 :
330 : uint32_t
331 212000 : passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) {
332 212000 : if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) {
333 101007 : return passgen_markov_leaf_count_raw(leaf, codepoint);
334 : } else {
335 110993 : return 0;
336 : }
337 : }
|