LCOV - code coverage report
Current view: top level - src - markov.c (source / functions) Hit Total Coverage
Test: passgen-test.info Lines: 147 168 87.5 %
Date: 2024-11-15 06:04:45 Functions: 17 19 89.5 %

          Line data    Source code
       1             : #include "passgen/markov.h"
       2             : #include "passgen/assert.h"
       3             : #include "passgen/config.h"
       4             : #include <assert.h>
       5             : #include <stdlib.h>
       6             : #include <string.h>
       7             : #define MIN(a, b)            ((a) < (b) ? (a) : (b))
       8             : #define SATURATING_SUB(a, b) ((a) -MIN(a, b))
       9             : 
      10             : #define MARKOV_LEAF_INITIAL    4
      11             : #define MARKOV_LEAF_MULTIPLIER 2
      12             : 
      13             : #define passgen_markov_leaf_count_raw(leaf, codepoint) \
      14             :     leaf->entries[leaf->capacity].count[codepoint % leaf->capacity]
      15             : 
      16             : #define passgen_markov_leaf_codepoint_raw(leaf, index) \
      17             :     leaf->entries[0].codepoint[index % leaf->capacity]
      18             : 
      19             : /// Sizes for the leaf and node hash maps. These are all prime numbers, with the
      20             : /// property that every number is greater than double the previous one.
      21             : /// Generated with Ruby with:
      22             : ///
      23             : /// ```ruby
      24             : /// require 'prime'
      25             : /// p Prime.take(10000).inject([3]){|l, p| l << p if (2*l.last) < p; l}
      26             : /// ```
      27             : static const size_t markov_sizes[] = {
      28             :     3,        7,        17,       37,     79,      163,     331,
      29             :     673,      1361,     2729,     5471,   10949,   21911,   43853,
      30             :     87719,    175447,   350899,   701819, 1403641, 2807303, 5614657,
      31             :     11229331, 22458671, 44917381, 0};
      32             : 
      33       81735 : size_t passgen_markov_node_size(size_t capacity) {
      34             :     // we use capacity + 1 there to make sure it's a round number, such that the
      35             :     // child pointers are not misaligned.
      36       81735 :     return sizeof(passgen_markov_node) + (capacity + 1) * sizeof(uint32_t) +
      37             :            capacity * sizeof(passgen_markov_node *);
      38             : }
      39             : 
      40       29801 : size_t passgen_markov_leaf_size(size_t capacity) {
      41       29801 :     return sizeof(passgen_markov_leaf) + capacity * sizeof(uint32_t) +
      42             :            capacity * sizeof(uint32_t);
      43             : }
      44             : 
      45       81735 : passgen_markov_node *passgen_markov_node_new(size_t size_index) {
      46       81735 :     size_t capacity = markov_sizes[size_index];
      47       81735 :     passgen_assert(capacity);
      48       81735 :     size_t size = passgen_markov_node_size(capacity);
      49       81735 :     passgen_markov_node *node = calloc(1, size);
      50       81735 :     memset(node, 0, size);
      51       81735 :     node->capacity = capacity;
      52       81735 :     return node;
      53             : }
      54             : 
      55       29801 : passgen_markov_leaf *passgen_markov_leaf_new(size_t size_index) {
      56       29801 :     size_t capacity = markov_sizes[size_index];
      57       29801 :     if(capacity == 0) {
      58           0 :         return NULL;
      59             :     }
      60       29801 :     size_t size = passgen_markov_leaf_size(capacity);
      61       29801 :     passgen_markov_leaf *leaf = calloc(1, size);
      62       29801 :     memset(leaf, 0, size);
      63       29801 :     leaf->capacity = capacity;
      64       29801 :     leaf->total_count = 0;
      65       29801 :     return leaf;
      66             : }
      67             : 
      68       29574 : void passgen_markov_leaf_free(passgen_markov_leaf *leaf) {
      69       29574 :     PASSGEN_CLEAR(leaf);
      70       29574 :     free(leaf);
      71       29574 : }
      72             : 
      73       80820 : void passgen_markov_node_free(passgen_markov_node *node, size_t level) {
      74     7386804 :     for(size_t i = 0; i < node->capacity; i++) {
      75     7305984 :         void *child = passgen_markov_node_child(node, i).leaf;
      76     7305984 :         if(child) {
      77      110376 :             if(level < 2) {
      78       29571 :                 passgen_markov_leaf_free(child);
      79             :             } else {
      80       80805 :                 passgen_markov_node_free(child, level - 1);
      81             :             }
      82             :         }
      83             :     }
      84             : 
      85       80820 :     PASSGEN_CLEAR(node);
      86       80820 :     free(node);
      87       80820 : }
      88             : 
      89           0 : void passgen_markov_node_check(passgen_markov_node *node, size_t level) {
      90           0 :     assert(node->capacity);
      91           0 :     assert(node->capacity % 2);
      92           0 :     for(size_t i = 0; i < node->capacity; i++) {
      93           0 :         void *data = passgen_markov_node_child(node, i).node;
      94           0 :         uint32_t codepoint = passgen_markov_node_codepoint(node, i);
      95           0 :         if(codepoint) {
      96           0 :             assert(data);
      97             :         }
      98           0 :         if(data) {
      99           0 :             assert((codepoint % node->capacity) == i);
     100           0 :             if(level > 2) {
     101           0 :                 passgen_markov_node_check(data, level - 1);
     102             :             }
     103             :         }
     104             :     }
     105           0 : }
     106             : 
     107           0 : void passgen_markov_check(passgen_markov *markov) {
     108           0 :     passgen_markov_node_check(markov->root, markov->level);
     109           0 : }
     110             : 
     111             : // Initialize new markov chain with a given level
     112          14 : void passgen_markov_init(passgen_markov *markov, uint8_t level) {
     113          14 :     passgen_assert(level);
     114          14 :     memset(markov, 0, sizeof(*markov));
     115          14 :     markov->level = level;
     116          14 :     markov->root = passgen_markov_node_new(0);
     117          14 : }
     118             : 
     119         225 : passgen_markov_leaf *passgen_markov_leaf_realloc(passgen_markov_leaf *leaf) {
     120         225 :     size_t size_index = 0;
     121        1149 :     while(markov_sizes[size_index] < leaf->capacity) {
     122         924 :         size_index++;
     123             :     }
     124             : 
     125         225 :     passgen_markov_leaf *new = passgen_markov_leaf_new(size_index + 1);
     126             : 
     127             :     // re-insert old elements
     128     1371790 :     for(size_t i = 0; i < leaf->capacity; i++) {
     129     1371565 :         if(passgen_markov_leaf_count_raw(leaf, i)) {
     130      184422 :             uint32_t codepoint = passgen_markov_leaf_codepoint_raw(leaf, i);
     131      184422 :             uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
     132      184422 :             new = passgen_markov_leaf_insert(new, codepoint, count);
     133             :         }
     134             :     }
     135             : 
     136         225 :     free(leaf);
     137             : 
     138         225 :     return new;
     139             : }
     140             : 
     141      319731 : passgen_markov_leaf *passgen_markov_leaf_insert(
     142             :     passgen_markov_leaf *leaf,
     143             :     uint32_t codepoint,
     144             :     size_t weight) {
     145             :     // don't do anything if asked to insert a codepoint with an empty weight.
     146      319731 :     if(weight == 0) {
     147           0 :         return leaf;
     148             :     }
     149             : 
     150      319956 :     while(passgen_markov_leaf_count_raw(leaf, codepoint) &&
     151         891 :           passgen_markov_leaf_codepoint_raw(leaf, codepoint) != codepoint) {
     152         225 :         leaf = passgen_markov_leaf_realloc(leaf);
     153             :     }
     154             : 
     155             :     // set codepoint
     156      319731 :     passgen_markov_leaf_codepoint_raw(leaf, codepoint) = codepoint;
     157      319731 :     passgen_markov_leaf_count_raw(leaf, codepoint) += weight;
     158      319731 :     leaf->total_count += weight;
     159             : 
     160      319731 :     return leaf;
     161             : }
     162             : 
     163         913 : passgen_markov_node *passgen_markov_node_realloc(passgen_markov_node *node) {
     164         913 :     size_t size_index = 0;
     165        4697 :     while(markov_sizes[size_index] < node->capacity) {
     166        3784 :         size_index++;
     167             :     }
     168             : 
     169         913 :     passgen_markov_node *new = passgen_markov_node_new(size_index + 1);
     170             : 
     171             :     // re-insert old elements
     172     7060542 :     for(size_t i = 0; i < node->capacity; i++) {
     173     7059629 :         if(passgen_markov_node_child(node, i).node) {
     174       37725 :             uint32_t codepoint = passgen_markov_node_codepoint(node, i);
     175       37725 :             new = passgen_markov_node_insert(new, codepoint);
     176       37725 :             assert(!passgen_markov_node_child(new, codepoint).node);
     177       37725 :             passgen_markov_node_child(new, codepoint).node =
     178       37725 :                 passgen_markov_node_child(node, i).node;
     179             :         }
     180             :     }
     181             : 
     182         913 :     free(node);
     183             : 
     184         913 :     return new;
     185             : }
     186             : 
     187             : passgen_markov_node *
     188      193217 : passgen_markov_node_insert(passgen_markov_node *node, uint32_t codepoint) {
     189      194130 :     while(passgen_markov_node_child(node, codepoint).node &&
     190       45029 :           passgen_markov_node_codepoint(node, codepoint) != codepoint) {
     191         913 :         node = passgen_markov_node_realloc(node);
     192             :     }
     193             : 
     194             :     // set codepoint
     195      193217 :     passgen_markov_node_codepoint(node, codepoint) = codepoint;
     196             : 
     197      193217 :     return node;
     198             : }
     199             : 
     200      154492 : passgen_markov_node *passgen_markov_node_insert_word(
     201             :     passgen_markov_node *node,
     202             :     const uint32_t *sequence,
     203             :     size_t length,
     204             :     size_t weight) {
     205      154492 :     uint32_t codepoint = sequence[0];
     206      154492 :     node = passgen_markov_node_insert(node, codepoint);
     207             : 
     208      154492 :     if(length == 2) {
     209       33308 :         passgen_markov_leaf *child =
     210       33308 :             passgen_markov_node_child(node, codepoint).leaf;
     211             : 
     212       33308 :         if(!child) {
     213       29571 :             child = passgen_markov_leaf_new(0);
     214             :         }
     215             : 
     216       33308 :         child = passgen_markov_leaf_insert(child, sequence[1], weight);
     217             : 
     218       33308 :         passgen_markov_node_child(node, codepoint).leaf = child;
     219             :     } else {
     220             :         // retrieve or create child node
     221      121184 :         passgen_markov_node *child =
     222      121184 :             passgen_markov_node_child(node, codepoint).node;
     223      121184 :         if(!child) {
     224       80805 :             child = passgen_markov_node_new(0);
     225             :         }
     226             : 
     227             :         // insert word into child
     228      121184 :         child = passgen_markov_node_insert_word(
     229             :             child,
     230             :             &sequence[1],
     231             :             length - 1,
     232             :             weight);
     233             : 
     234             :         // save child
     235      121184 :         passgen_markov_node_child(node, codepoint).node = child;
     236             :     }
     237             : 
     238      154492 :     return node;
     239             : }
     240             : 
     241       33307 : void passgen_markov_insert(
     242             :     passgen_markov *markov,
     243             :     const uint32_t *sequence,
     244             :     size_t weight) {
     245       66614 :     markov->root = passgen_markov_node_insert_word(
     246             :         markov->root,
     247             :         sequence,
     248       33307 :         markov->level + 1,
     249             :         weight);
     250       33307 :     markov->count += weight;
     251       33307 : }
     252             : 
     253             : // Add a word to a markov chain
     254        3091 : void passgen_markov_add(
     255             :     passgen_markov *markov,
     256             :     const uint32_t *word,
     257             :     size_t word_len,
     258        3091 :     size_t weight) {
     259             :     // insert start sequences
     260        3091 :     size_t sequence_len = 2 * markov->level + 1;
     261        3091 :     uint32_t sequence[sequence_len];
     262        3091 :     memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
     263        3091 :     memcpy(
     264        3091 :         &sequence[markov->level],
     265             :         word,
     266        3091 :         sizeof(uint32_t) * MIN(markov->level, word_len));
     267       15877 :     for(size_t i = 0; i < MIN(markov->level, word_len); i++) {
     268       12786 :         passgen_markov_insert(markov, &sequence[i], weight);
     269             :     }
     270       20521 :     for(size_t i = 0; i < SATURATING_SUB(word_len, markov->level); i++) {
     271       17430 :         passgen_markov_insert(markov, &word[i], weight);
     272             :     }
     273        3091 :     memset(sequence, 0, sizeof(sequence[0]) * sequence_len);
     274        3091 :     memcpy(
     275        3091 :         &sequence[SATURATING_SUB(markov->level, word_len)],
     276        3091 :         &word[SATURATING_SUB(word_len, markov->level)],
     277        3091 :         sizeof(uint32_t) * MIN(markov->level, word_len));
     278        3091 :     passgen_markov_insert(markov, &sequence[0], weight);
     279        3091 : }
     280             : 
     281             : // Free a markov chain
     282          14 : void passgen_markov_free(passgen_markov *markov) {
     283          14 :     passgen_markov_node_free(markov->root, markov->level);
     284          14 :     PASSGEN_CLEAR(markov);
     285          14 : }
     286             : 
     287       13000 : uint32_t passgen_markov_generate(
     288             :     passgen_markov *markov,
     289             :     const uint32_t *current,
     290             :     passgen_random *random,
     291             :     double *entropy) {
     292       13000 :     passgen_markov_node *node = markov->root;
     293             : 
     294       52000 :     for(size_t i = 0; i < markov->level; i++) {
     295       39000 :         node =
     296       39000 :             (passgen_markov_node *) passgen_markov_node_child(node, current[i])
     297             :                 .leaf;
     298       39000 :         assert(node);
     299             :     }
     300             : 
     301       13000 :     passgen_markov_leaf *leaf = (passgen_markov_leaf *) node;
     302             : 
     303             :     // record total choices.
     304       13000 :     if(entropy) {
     305           0 :         *entropy *= (double) leaf->total_count;
     306             :     }
     307             : 
     308       13000 :     size_t choice = passgen_random_u64_max(random, leaf->total_count);
     309    43198589 :     for(size_t i = 0; i < leaf->capacity; i++) {
     310    43198589 :         uint32_t count = passgen_markov_leaf_count_raw(leaf, i);
     311    43198589 :         if(count) {
     312      512513 :             if(choice < count) {
     313             :                 // record bucket (reduces total choices).
     314       13000 :                 if(entropy) {
     315           0 :                     *entropy /= (double) count;
     316             :                 }
     317             : 
     318       13000 :                 return passgen_markov_leaf_codepoint_raw(leaf, i);
     319             :             } else {
     320      499513 :                 choice -= count;
     321             :             }
     322             :         }
     323             :     }
     324             : 
     325           0 :     assert(false);
     326             : 
     327             :     return 0;
     328             : }
     329             : 
     330             : uint32_t
     331      212000 : passgen_markov_leaf_count(passgen_markov_leaf *leaf, uint32_t codepoint) {
     332      212000 :     if(passgen_markov_leaf_codepoint_raw(leaf, codepoint) == codepoint) {
     333      101007 :         return passgen_markov_leaf_count_raw(leaf, codepoint);
     334             :     } else {
     335      110993 :         return 0;
     336             :     }
     337             : }

Generated by: LCOV version 1.14