Coverage Report

Created: 2024-05-03 06:05

/builds/xfbs/passgen/src/util/utf8.c
Line
Count
Source (jump to first uncovered line)
1
#include "passgen/util/utf8.h"
2
3
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
4
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
5
6
496k
// Decoder states that the rest of this file compares against.
// UTF8_ACCEPT is the start state and is reported again whenever a complete
// code point has been decoded; UTF8_REJECT is reported for a malformed byte.
enum {
    UTF8_ACCEPT = 0,
    UTF8_REJECT = 1
};
8
9
// clang-format off
10
// Lookup table driving the UTF-8 decoding automaton used by utf8_decode().
//
// Entries 0..255 are indexed by the input byte and yield that byte's
// character class (see the "00..1f" .. "f0..ff" row labels). The entries from
// index 256 onwards form the transition table: 16 entries per state, read as
// utf8d[256 + state * 16 + class] to obtain the next state (rows "s0".."s8").
static const uint8_t utf8d[] = {
11
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
12
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
13
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
14
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
15
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
16
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
17
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
18
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
19
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
20
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
21
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
22
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
23
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
24
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
25
};
26
// clang-format on
27
28
inline static uint32_t
29
237k
utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
30
237k
    uint32_t type = utf8d[byte];
31
237k
32
237k
    *codep = (*state != UTF8_ACCEPT) ? 
(byte & 0x3fu) | (*codep << 6)23
33
237k
                                     : 
(0xff >> type) & (byte)237k
;
34
237k
35
237k
    *state = utf8d[256 + *state * 16 + type];
36
237k
    return *state;
37
237k
}
38
39
int passgen_utf8_decode(
40
    uint32_t **output,
41
    size_t output_len,
42
    uint8_t *output_widths,
43
    const uint8_t **input,
44
20.7k
    size_t input_len) {
45
20.7k
    uint32_t state = UTF8_ACCEPT;
46
20.7k
    uint32_t codepoint = 0;
47
20.7k
48
20.7k
    size_t input_offset = 0;
49
20.7k
    size_t output_offset = 0;
50
20.7k
    size_t codepoint_width = 0;
51
20.7k
52
20.7k
    // iterate while we still have input and we still have outputs.
53
258k
    while((input_offset < input_len) && 
(output_offset < output_len)237k
) {
54
237k
        uint32_t status =
55
237k
            utf8_decode(&state, &codepoint, (*input)[input_offset]);
56
237k
57
237k
        if(status == UTF8_REJECT) {
58
8
            return PASSGEN_UTF8_INVALID_CHAR;
59
8
        }
60
237k
61
237k
        if(status == UTF8_ACCEPT) {
62
237k
            (*output)[output_offset] = codepoint;
63
237k
64
237k
            if(output_widths) {
65
236k
                output_widths[output_offset] = codepoint_width + 1;
66
236k
            }
67
237k
68
237k
            codepoint_width = 0;
69
237k
            codepoint = 0;
70
237k
            output_offset += 1;
71
237k
        } else {
72
23
            codepoint_width += 1;
73
23
        }
74
237k
75
237k
        input_offset += 1;
76
237k
    }
77
20.7k
78
20.7k
    *output += output_offset;
79
20.7k
    *input += input_offset;
80
20.7k
81
20.7k
    if(input_offset != input_len) {
82
2
        return PASSGEN_UTF8_OUTPUT_SIZE;
83
2
    }
84
20.7k
85
20.7k
    return PASSGEN_UTF8_SUCCESS;
86
20.7k
}
87
88
/// Encode a code point using UTF-8
89
///
90
/// @author Ondřej Hruška <ondra@ondrovo.com>
91
/// @license MIT
92
///
93
/// @param out - output buffer (min 4 characters), will be 0-terminated
94
/// @param utf - code point 0-0x10FFFF
95
/// @return number of bytes on success, 0 on failure (also produces U+FFFD,
96
/// which uses 3 bytes)
97
286
int passgen_utf8_encode_codepoint(uint8_t *out, uint32_t utf) {
98
286
    if(utf <= 0x7F) {
99
283
        // Plain ASCII
100
283
        out[0] = (uint8_t) utf;
101
283
        return 1;
102
283
    }
103
3
104
3
    if(utf <= 0x07FF) {
105
2
        // 2-byte unicode
106
2
        out[0] = (uint8_t) (((utf >> 6) & 0x1F) | 0xC0);
107
2
        out[1] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
108
2
        return 2;
109
2
    }
110
1
111
1
    if(utf <= 0xFFFF) {
112
0
        // 3-byte unicode
113
0
        out[0] = (uint8_t) (((utf >> 12) & 0x0F) | 0xE0);
114
0
        out[1] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80);
115
0
        out[2] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
116
0
        return 3;
117
0
    }
118
1
119
1
    if(utf <= 0x10FFFF) {
120
1
        // 4-byte unicode
121
1
        out[0] = (uint8_t) (((utf >> 18) & 0x07) | 0xF0);
122
1
        out[1] = (uint8_t) (((utf >> 12) & 0x3F) | 0x80);
123
1
        out[2] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80);
124
1
        out[3] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
125
1
        return 4;
126
1
    }
127
0
128
0
    // error
129
0
    return PASSGEN_UTF8_INVALID_CHAR;
130
0
}
131
132
int passgen_utf8_encode(
133
    uint8_t *out,
134
    size_t out_len,
135
    size_t *out_pos,
136
    const uint32_t *in,
137
    size_t in_len,
138
1
    size_t *in_pos) {
139
5
    while((*in_pos < in_len) && 
(*out_pos < (out_len - 4))4
) {
140
4
        int ret = passgen_utf8_encode_codepoint(&out[*out_pos], in[*in_pos]);
141
4
        if(ret < 0) {
142
0
            return ret;
143
0
        }
144
4
145
4
        *in_pos += 1;
146
4
        *out_pos += ret;
147
4
    }
148
1
149
1
    if(*out_pos < out_len) {
150
1
        out[*out_pos] = 0;
151
1
    }
152
1
153
1
    if(*in_pos != in_len) {
154
0
        return PASSGEN_UTF8_OUTPUT_SIZE;
155
0
    }
156
1
157
1
    return PASSGEN_UTF8_SUCCESS;
158
1
}
159
160
8
const char *passgen_utf8_error(int retval) {
161
8
    switch(retval) {
162
8
        case PASSGEN_UTF8_INVALID_CHAR:
163
8
            return "invalid character";
164
0
        case PASSGEN_UTF8_SUCCESS:
165
0
            return "success";
166
0
        case PASSGEN_UTF8_OUTPUT_SIZE:
167
0
            return "output size too small";
168
0
        default:
169
0
            return "unknown";
170
8
    }
171
8
}