LCOV - code coverage report
Current view: top level - src - markov.c (source / functions) Hit Total Coverage
Test: passgen-test.info Lines: 128 165 77.6 %
Date: 2024-05-03 06:05:14 Functions: 16 19 84.2 %

          Line data    Source code
       1             : #include "passgen/markov.h"
       2             : #include "passgen/assert.h"
       3             : #include <assert.h>
       4             : #include <stdlib.h>
       5             : #include <string.h>
       6             : #define MIN(a, b)            ((a) < (b) ? (a) : (b))
       7             : #define SATURATING_SUB(a, b) ((a) -MIN(a, b))
       8             : 
       9             : #define MARKOV_LEAF_INITIAL    4
      10             : #define MARKOV_LEAF_MULTIPLIER 2
      11             : 
      12             : #define passgen_markov_leaf_count_raw(leaf, codepoint) \
      13             :     leaf->entries[leaf->capacity].count[codepoint % leaf->capacity]
      14             : 
      15             : #define passgen_markov_leaf_codepoint_raw(leaf, index) \
      16             :     leaf->entries[0].codepoint[index % leaf->capacity]
      17             : 
      18             : /// Sizes for the leaf and node hash maps. These are all prime numbers, with the
      19             : /// property that every number is greater than double the previous one.
      20             : /// Generated with Ruby with:
      21             : ///
      22             : /// ```ruby
      23             : /// require 'prime'
      24             : /// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l}
      25             : /// ```
      26             : static const size_t markov_sizes[] = {
      27             :     3,        7,        17,       37,     79,      163,     331,
      28             :     673,      1361,     2729,     5471,   10949,   21911,   43853,
      29             :     87719,    175447,   350899,   701819, 1403641, 2807303, 5614657,
      30             :     11229331, 22458671, 44917381, 0};
      31             : 
      32       30827 : size_t passgen_markov_node_size(size_t capacity) {
      33             :     // we use capacity + 1 there to make sure it's a round number, such that the
      34             :     // child pointers are not misaligned.
      35       30827 :     return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) +
      36             :            capacity * sizeof(passgen_markov_node *);
      37             : }
      38             : 
      39        9380 : size_t passgen_markov_leaf_size(size_t capacity) {
      40        9380 :     return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) +
      41             :            capacity * sizeof(uint32_t);
      42             : }
      43             : 
      44       30827 : passgen_markov_node *passgen_markov_node_new(size_t size_index) {
      45       30827 :     size_t capacity = markov_sizes[size_index];
      46       30827 :     passgen_assert(capacity);
      47       30827 :     size_t size = passgen_markov_node_size(capacity);
      48       30827 :     passgen_markov_node *node = calloc(1, size);
      49       30827 :     memset(node, 0, size);
      50       30827 :     node->capacity = capacity;
      51       30827 :     return node;
      52             : }
      53             : 
      54        9380 : passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) {
      55        9380 :     size_t capacity = markov_sizes[size_index];
      56        9380 :     if(capacity == 0) {
      57           0 :         return NULL;
      58             :     }
      59        9380 :     size_t size = passgen_markov_leaf_size(capacity);
      60        9380 :     passgen_markov_leaf *leaf = calloc(1, size);
      61        9380 :     memset(leaf, 0, size);
      62        9380 :     leaf->capacity = capacity;
      63        9380 :     leaf->total_count = 0;
      64        9380 :     return leaf;
      65             : }
      66             : 
      67        9180 : void passgen_markov_leaf_free(passgen_markov_leaf *leaf) {
      68        9180 :     free(leaf);
      69        9180 : }
      70             : 
      71       30237 : void passgen_markov_node_free(passgen_markov_node *node, size_t level) {
      72     4117450 :     for(size_t i = 0; i < node->capacity; i++) {
      73     4087213 :         void *child = passgen_markov_node_child(node, i).leaf;
      74     4087213 :         if(child) {
      75       39400 :             if(level < 2) {
      76        9177 :                 passgen_markov_leaf_free(child);
      77             :             } else {
      78       30223 :                 passgen_markov_node_free(child, level - 1);
      79             :             }
      80             :         }
      81             :     }
      82             : 
      83       30237 :     free(node);
      84       30237 : }
      85             : 
      86           0 : void passgen_markov_node_check(passgen_markov_node *node, size_t level) {
      87           0 :     assert(node->capacity);
      88           0 :     assert(node->capacity % 2);
      89           0 :     for(size_t i = 0; i < node->capacity; i++) {
      90           0 :         void *data = passgen_markov_node_child(node, i).node;
      91           0 :         uint32_t codepoint = passgen_markov_node_codepoint(node, i);
      92           0 :         if(codepoint) {
      93           0 :             assert(data);
      94             :         }
      95           0 :         if(data) {
      96           0 :             assert((codepoint % node->capacity) == i);
      97           0 :             if(level > 2) {
      98           0 :                 passgen_markov_node_check(data, level - 1);
      99             :             }
     100             :         }
     101             :     }
     102           0 : }
     103             : 
     104           0 : void passgen_markov_check(passgen_markov *markov) {
     105           0 :     passgen_markov_node_check(markov->root, markov->level);
     106           0 : }
     107             : 
     108             : // Initialize new markov chain with a given level
     109          13 : void passgen_markov_init(passgen_markov *markov, uint8_t level) {
     110          13 :     passgen_assert(level);
     111          13 :     memset(markov, 0, sizeof(*markov));
     112          13 :     markov->level = level;
     113          13 :     markov->root = passgen_markov_node_new(0);
     114          13 : }
     115             : 
     116         198 : passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) {
     117         198 :     size_t size_index = 0;
     118         988 :     while(markov_sizes[size_index] < leaf->capacity) {
     119         790 :         size_index++;
     120             :     }
     121             : 
     122         198 :     passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1);
     123             : 
     124             :     // re-insert old elements
     125    45166668 :     for(size_t i = 0; i < leaf->capacity; i++) {
     126    45166470 :         if(passgen_markov_leaf_count_raw(leaf, i)) {
     127      193682 :             uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i);
     128      193682 :             uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
     129      193682 :             new = passgen_markov_leaf_insert(new, codepoint, count);
     130             :         }
     131             :     }
     132             : 
     133         198 :     free(leaf);
     134             : 
     135         198 :     return new;
     136             : }
     137             : 
     138      315891 : passgen_markov_leaf *passgen_markov_leaf_insert(
     139             :     passgen_markov_leaf *leaf,
     140             :     uint32_t codepoint,
     141             :     size_t weight) {
     142             :     // don't do anything if asked to insert a codepoint with an empty weight.
     143      315891 :     if(weight == 0) {
     144           0 :         return leaf;
     145             :     }
     146             : 
     147      316089 :     while(passgen_markov_leaf_count_raw(leaf, codepoint) &&
     148         871 :           passgen_markov_leaf_codepoint_raw(leaf, codepoint) != codepoint) {
     149         198 :         leaf = passgen_markov_leaf_realloc(leaf);
     150             :     }
     151             : 
     152             :     // set codepoint
     153      315891 :     passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint;
     154      315891 :     passgen_markov_leaf_count_raw(leaf, codepoint) += weight;
     155      315891 :     leaf->total_count += weight;
     156             : 
     157      315891 :     return leaf;
     158             : }
     159             : 
     160         588 : passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) {
     161         588 :     size_t size_index = 0;
     162        3134 :     while(markov_sizes[size_index] < node->capacity) {
     163        2546 :         size_index++;
     164             :     }
     165             : 
     166         588 :     passgen_markov_node *new = passgen_markov_node_new(size_index + 1);
     167             : 
     168             :     // re-insert old elements
     169     3994742 :     for(size_t i = 0; i < node->capacity; i++) {
     170     3994154 :         if(passgen_markov_node_child(node, i).node) {
     171       22089 :             uint32_t codepoint = passgen_markov_node_codepoint(node, i);
     172       22089 :             new = passgen_markov_node_insert(new, codepoint);
     173       22089 :             assert(!passgen_markov_node_child(new, codepoint).node);
     174       22089 :             passgen_markov_node_child(new, codepoint).node =
     175       22089 :                 passgen_markov_node_child(node, i).node;
     176             :         }
     177             :     }
     178             : 
     179         588 :     free(node);
     180             : 
     181         588 :     return new;
     182             : }
     183             : 
     184             : passgen_markov_node *
     185       83981 : passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) {
     186       84569 :     while(passgen_markov_node_child(node, codepoint).node &&
     187       22080 :           passgen_markov_node_codepoint(node, codepoint) != codepoint) {
     188         588 :         node = passgen_markov_node_realloc(node);
     189             :     }
     190             : 
     191             :     // set codepoint
     192       83981 :     passgen_markov_node_codepoint(node, codepoint) = codepoint;
     193             : 
     194       83981 :     return node;
     195             : }
     196             : 
     197       60892 : passgen_markov_node *passgen_markov_node_insert_word(
     198             :     passgen_markov_node *node,
     199             :     const uint32_t *sequence,
     200             :     size_t length,
     201             :     size_t weight) {
     202       60892 :     uint32_t codepoint = sequence[0];
     203       60892 :     node = passgen_markov_node_insert(node, codepoint);
     204             : 
     205       60892 :     if(length == 2) {
     206       11208 :         passgen_markov_leaf *child =
     207       11208 :             passgen_markov_node_child(node, codepoint).leaf;
     208             : 
     209       11208 :         if(!child) {
     210        9177 :             child = passgen_markov_leaf_new(0);
     211             :         }
     212             : 
     213       11208 :         child = passgen_markov_leaf_insert(child, sequence[1], weight);
     214             : 
     215       11208 :         passgen_markov_node_child(node, codepoint).leaf = child;
     216             :     } else {
     217             :         // retrieve or create child node
     218       49684 :         passgen_markov_node *child =
     219       49684 :             passgen_markov_node_child(node, codepoint).node;
     220       49684 :         if(!child) {
     221       30223 :             child = passgen_markov_node_new(0);
     222             :         }
     223             : 
     224             :         // insert word into child
     225       49684 :         child = passgen_markov_node_insert_word(
     226             :             child,
     227             :             &sequence[1],
     228             :             length - 1,
     229             :             weight);
     230             : 
     231             :         // save child
     232       49684 :         passgen_markov_node_child(node, codepoint).node = child;
     233             :     }
     234             : 
     235       60892 :     return node;
     236             : }
     237             : 
     238       11207 : void passgen_markov_insert(
     239             :     passgen_markov *markov,
     240             :     const uint32_t *sequence,
     241             :     size_t weight) {
     242       22414 :     markov->root = passgen_markov_node_insert_word(
     243             :         markov->root,
     244             :         sequence,
     245       11207 :         markov->level + 1,
     246             :         weight);
     247       11207 :     markov->count += weight;
     248       11207 : }
     249             : 
     250             : // Add a word to a markov chain
     251        1391 : void passgen_markov_add(
     252             :     passgen_markov *markov,
     253             :     const uint32_t *word,
     254             :     size_t word_len,
     255        1391 :     size_t weight) {
     256             :     // insert start sequences
     257        1391 :     size_t sequence_len = 2 * markov->level + 1;
     258        1391 :     uint32_t sequence[sequence_len];
     259        1391 :     memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
     260        1391 :     memcpy(
     261        1391 :         &sequence[markov->level],
     262             :         word,
     263        1391 :         sizeof(uint32_t) * MIN(markov->level, word_len));
     264        6977 :     for(size_t i = 0; i < MIN(markov->level, word_len); i++) {
     265        5586 :         passgen_markov_insert(markov, &sequence[i], weight);
     266             :     }
     267        5621 :     for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); i++) {
     268        4230 :         passgen_markov_insert(markov, &word[i], weight);
     269             :     }
     270        1391 :     memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
     271        1391 :     memcpy(
     272        1391 :         &sequence[SATURATING_SUB(markov->level, word_len)],
     273        1391 :         &word[SATURATING_SUB(word_len, markov->level)],
     274        1391 :         sizeof(uint32_t) * MIN(markov->level, word_len));
     275        1391 :     passgen_markov_insert(markov, &sequence[0], weight);
     276        1391 : }
     277             : 
     278             : // Free a markov chain
     279          13 : void passgen_markov_free(passgen_markov *markov) {
     280          13 :     passgen_markov_node_free(markov->root, markov->level);
     281          13 : }
     282             : 
     283           0 : uint32_t passgen_markov_generate(
     284             :     passgen_markov *markov,
     285             :     const uint32_t *current,
     286             :     passgen_random *random,
     287             :     double *entropy) {
     288           0 :     passgen_markov_node *node = markov->root;
     289             : 
     290           0 :     for(size_t i = 0; i < markov->level; i++) {
     291           0 :         node =
     292           0 :             (passgen_markov_node *) passgen_markov_node_child(node, current[i])
     293             :                 .leaf;
     294           0 :         assert(node);
     295             :     }
     296             : 
     297           0 :     passgen_markov_leaf *leaf = (passgen_markov_leaf *) node;
     298             : 
     299             :     // record total choices.
     300           0 :     if(entropy) {
     301           0 :         *entropy *= (double) leaf->total_count;
     302             :     }
     303             : 
     304           0 :     size_t choice = passgen_random_u64_max(random, leaf->total_count);
     305           0 :     for(size_t i = 0; i < leaf->capacity; i++) {
     306           0 :         uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
     307           0 :         if(count) {
     308           0 :             if(choice < count) {
     309             :                 // record bucket (reduces total choices).
     310           0 :                 if(entropy) {
     311           0 :                     *entropy /= (double) count;
     312             :                 }
     313             : 
     314           0 :                 return passgen_markov_leaf_codepoint_raw(leaf, i);
     315             :             } else {
     316           0 :                 choice -= count;
     317             :             }
     318             :         }
     319             :     }
     320             : 
     321           0 :     assert(false);
     322             : 
     323             :     return 0;
     324             : }
     325             : 
     326             : uint32_t
     327      221000 : passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) {
     328      221000 :     if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) {
     329      110021 :         return passgen_markov_leaf_count_raw(leaf, codepoint);
     330             :     } else {
     331      110979 :         return 0;
     332             :     }
     333             : }

Generated by: LCOV version 1.14