LCOV - code coverage report
Current view: top level - src/util - utf8.c (source / functions) Hit Total Coverage
Test: passgen-test.info Lines: 59 72 81.9 %
Date: 2024-11-29 06:05:05 Functions: 5 5 100.0 %

          Line data    Source code
       1             : #include "passgen/util/utf8.h"
       2             : 
       3             : // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
       4             : // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
       5             : 
       6             : #define UTF8_ACCEPT 0
       7             : #define UTF8_REJECT 1
       8             : 
       9             : // clang-format off
      10             : static const uint8_t utf8d[] = {
      11             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
      12             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
      13             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
      14             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
      15             :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
      16             :   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
      17             :   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
      18             :   0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
      19             :   0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
      20             :   0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
      21             :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
      22             :   1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
      23             :   1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
      24             :   1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
      25             : };
      26             : // clang-format on
      27             : 
      28             : inline static uint32_t
      29      238333 : utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
      30      238333 :     uint32_t type = utf8d[byte];
      31             : 
      32      476689 :     *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
      33      238333 :                                      : (0xff >> type) & (byte);
      34             : 
      35      238333 :     *state = utf8d[256 + *state * 16 + type];
      36      238333 :     return *state;
      37             : }
      38             : 
      39       20792 : int passgen_utf8_decode(
      40             :     uint32_t **output,
      41             :     size_t output_len,
      42             :     uint8_t *output_widths,
      43             :     const uint8_t **input,
      44             :     size_t input_len) {
      45       20792 :     uint32_t state = UTF8_ACCEPT;
      46       20792 :     uint32_t codepoint = 0;
      47             : 
      48       20792 :     size_t input_offset = 0;
      49       20792 :     size_t output_offset = 0;
      50       20792 :     size_t codepoint_width = 0;
      51             : 
      52             :     // iterate while we still have input and we still have outputs.
      53      259117 :     while((input_offset < input_len) && (output_offset < output_len)) {
      54             :         uint32_t status =
      55      238333 :             utf8_decode(&state, &codepoint, (*input)[input_offset]);
      56             : 
      57      238333 :         if(status == UTF8_REJECT) {
      58           8 :             return PASSGEN_UTF8_INVALID_CHAR;
      59             :         }
      60             : 
      61      238325 :         if(status == UTF8_ACCEPT) {
      62      238302 :             (*output)[output_offset] = codepoint;
      63             : 
      64      238302 :             if(output_widths) {
      65      236902 :                 output_widths[output_offset] = codepoint_width + 1;
      66             :             }
      67             : 
      68      238302 :             codepoint_width = 0;
      69      238302 :             codepoint = 0;
      70      238302 :             output_offset += 1;
      71             :         } else {
      72          23 :             codepoint_width += 1;
      73             :         }
      74             : 
      75      238325 :         input_offset += 1;
      76             :     }
      77             : 
      78       20784 :     *output += output_offset;
      79       20784 :     *input += input_offset;
      80             : 
      81       20784 :     if(input_offset != input_len) {
      82           2 :         return PASSGEN_UTF8_OUTPUT_SIZE;
      83             :     }
      84             : 
      85       20782 :     return PASSGEN_UTF8_SUCCESS;
      86             : }
      87             : 
      88             : /// Encode a code point using UTF-8
      89             : ///
      90             : /// @author Ondřej Hruška <ondra@ondrovo.com>
      91             : /// @license MIT
      92             : ///
      93             : /// @param out - output buffer (min 4 characters), will be 0-terminated
      94             : /// @param utf - code point 0-0x10FFFF
      95             : /// @return number of bytes on success, 0 on failure (also produces U+FFFD,
      96             : /// which uses 3 bytes)
      97         291 : int passgen_utf8_encode_codepoint(uint8_t *out, uint32_t utf) {
      98         291 :     if(utf <= 0x7F) {
      99             :         // Plain ASCII
     100         288 :         out[0] = (uint8_t) utf;
     101         288 :         return 1;
     102             :     }
     103             : 
     104           3 :     if(utf <= 0x07FF) {
     105             :         // 2-byte unicode
     106           2 :         out[0] = (uint8_t) (((utf >> 6) & 0x1F) | 0xC0);
     107           2 :         out[1] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
     108           2 :         return 2;
     109             :     }
     110             : 
     111           1 :     if(utf <= 0xFFFF) {
     112             :         // 3-byte unicode
     113           0 :         out[0] = (uint8_t) (((utf >> 12) & 0x0F) | 0xE0);
     114           0 :         out[1] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80);
     115           0 :         out[2] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
     116           0 :         return 3;
     117             :     }
     118             : 
     119           1 :     if(utf <= 0x10FFFF) {
     120             :         // 4-byte unicode
     121           1 :         out[0] = (uint8_t) (((utf >> 18) & 0x07) | 0xF0);
     122           1 :         out[1] = (uint8_t) (((utf >> 12) & 0x3F) | 0x80);
     123           1 :         out[2] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80);
     124           1 :         out[3] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
     125           1 :         return 4;
     126             :     }
     127             : 
     128             :     // error
     129           0 :     return PASSGEN_UTF8_INVALID_CHAR;
     130             : }
     131             : 
     132           1 : int passgen_utf8_encode(
     133             :     uint8_t *out,
     134             :     size_t out_len,
     135             :     size_t *out_pos,
     136             :     const uint32_t *in,
     137             :     size_t in_len,
     138             :     size_t *in_pos) {
     139           5 :     while((*in_pos < in_len) && (*out_pos < (out_len - 4))) {
     140           4 :         int ret = passgen_utf8_encode_codepoint(&out[*out_pos], in[*in_pos]);
     141           4 :         if(ret < 0) {
     142           0 :             return ret;
     143             :         }
     144             : 
     145           4 :         *in_pos += 1;
     146           4 :         *out_pos += ret;
     147             :     }
     148             : 
     149           1 :     if(*out_pos < out_len) {
     150           1 :         out[*out_pos] = 0;
     151             :     }
     152             : 
     153           1 :     if(*in_pos != in_len) {
     154           0 :         return PASSGEN_UTF8_OUTPUT_SIZE;
     155             :     }
     156             : 
     157           1 :     return PASSGEN_UTF8_SUCCESS;
     158             : }
     159             : 
     160           8 : const char *passgen_utf8_error(int retval) {
     161           8 :     switch(retval) {
     162           8 :         case PASSGEN_UTF8_INVALID_CHAR:
     163           8 :             return "invalid character";
     164           0 :         case PASSGEN_UTF8_SUCCESS:
     165           0 :             return "success";
     166           0 :         case PASSGEN_UTF8_OUTPUT_SIZE:
     167           0 :             return "output size too small";
     168           0 :         default:
     169           0 :             return "unknown";
     170             :     }
     171             : }

Generated by: LCOV version 1.14