/builds/xfbs/passgen/include/passgen/markov.h
Line | Count | Source |
1 | | /// @file markov.h |
2 | | /// @author Patrick M. Elsen <pelsen@xfbs.net> |
3 | | /// @brief Markov chain data structure and functions. |
4 | | /// |
5 | | /// The data structures and functions in this file are used to generate |
6 | | /// pronounceable phrases by loading in a word list, generating a Markov chain |
7 | | /// from it, and then using the Markov chain to generate sequences that are |
8 | | /// pronounceable in English but not real words. |
9 | | #pragma once |
10 | | #include "passgen/util/random.h" |
11 | | #include <stddef.h> |
12 | | #include <stdint.h> |
13 | | #include <stdbool.h> |
14 | | |
15 | | /// Get the child with the given codepoint. |
16 | | /// @memberof passgen_markov_node |
17 | | /// @param node Node to get child of |
18 | | /// @param codepoint Codepoint to lookup child for |
19 | | #define passgen_markov_node_child(node, codepoint) \ |
20 | 8.33M | node->entries[(node->capacity + 1) / 2].child[codepoint % node->capacity] |
21 | | |
22 | | /// Get the codepoint at the given index |
23 | | /// @memberof passgen_markov_node |
24 | | /// @param node Node to get codepoint for |
25 | | /// @param index Index to lookup |
26 | | #define passgen_markov_node_codepoint(node, index) \ |
27 | 128k | node->entries[0].codepoint[index % node->capacity] |
28 | | |
29 | | /// Leaf node in the Markov chain. This contains a sorted list of |
30 | | /// codepoint-count tuples. During insertion, these are sorted by the |
31 | | /// codepoint. When finalizing the Markov chain, they might be reordered to be |
32 | | /// sorted by the frequency to allow for more efficient generation. |
33 | | typedef struct { |
34 | | size_t total_count; |
35 | | size_t capacity; |
36 | | union { |
37 | | uint32_t count[1]; |
38 | | uint32_t codepoint[1]; |
39 | | } entries[0]; |
40 | | } passgen_markov_leaf; |
41 | | |
42 | | /// Intermediate nodes in the markov chain tree. Implemented as custom hash map |
43 | | /// where the keys are codepoints and the values are nodes or leafs. |
44 | | typedef struct passgen_markov_node { |
45 | | size_t capacity; |
46 | | union { |
47 | | uint32_t codepoint[1]; |
48 | | union { |
49 | | struct passgen_markov_node *node; |
50 | | passgen_markov_leaf *leaf; |
51 | | } child[1]; |
52 | | } entries[0]; |
53 | | } passgen_markov_node; |
54 | | |
55 | | /// Markov chain, used to generate pronounceable random words |
56 | | /// |
57 | | /// Contains a tree of unicode codepoints and frequency data. |
58 | | /// |
59 | | /// This data structure answers the question "given the previous characters x |
60 | | /// and y, what are the probabilities for the next character?". Using |
61 | | /// #passgen_markov_add, you can add words to it. Given a sequence of |
62 | | /// characters, using #passgen_markov_generate you can generate a new |
63 | | /// character at random. |
64 | | /// |
65 | | /// The @ref length of a Markov chain is the length of tuples that it looks at. |
66 | | typedef struct { |
67 | | /// Length of codepoint tuples that this Markov chains stores. |
68 | | size_t level; |
69 | | /// How many words are in this Markov chain |
70 | | size_t count; |
71 | | /// Root node of markov chain |
72 | | passgen_markov_node *root; |
73 | | } passgen_markov; |
74 | | |
75 | | |
76 | | /// Allocates a new Markov chain node with a size determined by the @ref |
77 | | /// size_index. |
78 | | /// |
79 | | /// Markov chain nodes only exist in specific, discrete sizes that are prime |
80 | | /// numbers. This is because they implement a kind of hash map and that makes |
81 | | /// them more efficient. The @ref size_index is an index into a static array |
82 | | /// of precomputed prime sizes. |
83 | | /// |
84 | | /// @memberof passgen_markov_node |
85 | | passgen_markov_node *passgen_markov_node_new(size_t size_index); |
86 | | |
87 | | /// Free a Markov chain node. |
88 | | /// |
89 | | /// The parameter @ref level is the level of the Markov chain, this needs to be |
90 | | /// passed so that the method knows how deep to free data. |
91 | | /// |
92 | | /// @memberof passgen_markov_node |
93 | | void passgen_markov_node_free(passgen_markov_node *node, size_t level); |
94 | | |
95 | | /// Insert a new codepoint into a Markov chain node. |
96 | | /// |
97 | | /// This returns a pointer to a passgen_markov_node because the node may need |
98 | | /// to be reallocated if it would otherwise overflow. |
99 | | /// |
100 | | /// @memberof passgen_markov_node |
101 | | passgen_markov_node *passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint); |
102 | | |
103 | | /// Insert a new word into this Markov chain node. |
104 | | /// |
105 | | /// This will recursively insert the word into the appropriate child nodes. It |
106 | | /// needs to know the @ref length of the word (this is equal to the @ref level |
107 | | /// of the Markov chain, minus the depth it has already traversed) and the |
108 | | /// weight of the word. |
109 | | /// |
110 | | /// @memberof passgen_markov_node |
111 | | passgen_markov_node *passgen_markov_node_insert_word( |
112 | | passgen_markov_node *node, |
113 | | const uint32_t *sequence, |
114 | | size_t length, |
115 | | size_t weight); |
116 | | |
117 | | /// Allocates a new Markov leaf node with a size determined by the @ref |
118 | | /// size_index. |
119 | | /// |
120 | | /// Markov leaf nodes only exist in specific, discrete sizes that are prime |
121 | | /// numbers. This is because they implement a kind of hash map that makes them |
122 | | /// more efficient. The @ref size_index is an index into a static array of |
123 | | /// precomputed prime sizes. |
124 | | /// |
125 | | /// @memberof passgen_markov_leaf |
126 | | passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index); |
127 | | |
128 | | /// Free a Markov leaf node. |
129 | | /// |
130 | | /// @memberof passgen_markov_leaf |
131 | | void passgen_markov_leaf_free(passgen_markov_leaf *leaf); |
132 | | |
133 | | /// Insert a final codepoint and weight into a Markov leaf node. |
134 | | /// |
135 | | /// @memberof passgen_markov_leaf |
136 | | passgen_markov_leaf *passgen_markov_leaf_insert(passgen_markov_leaf *leaf, uint32_t codepoint, size_t weight); |
137 | | |
138 | | /// Given a codepoint, return a count or zero if the codepoint does not exist. |
139 | | /// |
140 | | /// @memberof passgen_markov_leaf |
141 | | uint32_t passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint); |
142 | | |
143 | | /// Initialize new markov chain with a given level |
144 | | /// |
145 | | /// @memberof passgen_markov |
146 | | void passgen_markov_init(passgen_markov *markov, uint8_t level); |
147 | | |
148 | | /// Add a word to a markov chain |
149 | | /// |
150 | | /// @memberof passgen_markov |
151 | | void passgen_markov_add( |
152 | | passgen_markov *markov, |
153 | | const uint32_t *word, |
154 | | size_t word_len, |
155 | | size_t weight); |
156 | | |
157 | | /// Generate the next character at random using the Markov chain and a @ref passgen_random instance. |
158 | | /// @param current Must contain markov->level amount of previous chars |
159 | | /// @param markov Must be a parsed markov chain |
160 | | /// @param random Must be a valid randomness instance |
161 | | /// @param complexity Will be multiplied with the complexity of the choice if it not NULL. |
162 | | /// @return The generated next character |
163 | | /// @memberof passgen_markov |
164 | | uint32_t passgen_markov_generate( |
165 | | passgen_markov *markov, |
166 | | const uint32_t *current, |
167 | | passgen_random *random, |
168 | | double *complexity); |
169 | | |
170 | | /// Free a markov chain |
171 | | /// |
172 | | /// @memberof passgen_markov |
173 | | void passgen_markov_free(passgen_markov *markov); |