/builds/xfbs/passgen/src/util/utf8.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "passgen/util/utf8.h" |
2 | | |
3 | | // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
4 | | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
5 | | |
// Decoder states that callers need to distinguish. Any other state value
// means the automaton is in the middle of a multi-byte sequence.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

// clang-format off
// Lookup table for the DFA decoder. The first 256 entries map an input byte
// to its character class; the entries after that map (state * 16 + class) to
// the next state.
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
// clang-format on

/// Feed one byte into the UTF-8 decoder automaton.
///
/// @param state - current automaton state, updated in place; start a new
///                sequence with UTF8_ACCEPT
/// @param codep - code point accumulator, updated in place; it holds a
///                complete code point only when UTF8_ACCEPT is returned
/// @param byte  - next input byte; must be in the range 0..255 because it
///                indexes the class section of the table directly
/// @return the new state: UTF8_ACCEPT when a code point is complete,
///         UTF8_REJECT on malformed input, any other value when more bytes
///         are needed
static inline uint32_t
utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
    uint32_t type = utf8d[byte];

    if(*state != UTF8_ACCEPT) {
        // Continuation byte: append its low six payload bits.
        *codep = (byte & 0x3fu) | (*codep << 6);
    } else {
        // Lead byte: the class number doubles as the count of marker bits to
        // strip from the top of the byte.
        *codep = (0xffu >> type) & byte;
    }

    *state = utf8d[256 + *state * 16 + type];
    return *state;
}
38 | | |
39 | | int passgen_utf8_decode( |
40 | | uint32_t **output, |
41 | | size_t output_len, |
42 | | uint8_t *output_widths, |
43 | | const uint8_t **input, |
44 | 20.7k | size_t input_len) { |
45 | 20.7k | uint32_t state = UTF8_ACCEPT; |
46 | 20.7k | uint32_t codepoint = 0; |
47 | 20.7k | |
48 | 20.7k | size_t input_offset = 0; |
49 | 20.7k | size_t output_offset = 0; |
50 | 20.7k | size_t codepoint_width = 0; |
51 | 20.7k | |
52 | 20.7k | // iterate while we still have input and we still have outputs. |
53 | 258k | while((input_offset < input_len) && (output_offset < output_len)237k ) { |
54 | 237k | uint32_t status = |
55 | 237k | utf8_decode(&state, &codepoint, (*input)[input_offset]); |
56 | 237k | |
57 | 237k | if(status == UTF8_REJECT) { |
58 | 8 | return PASSGEN_UTF8_INVALID_CHAR; |
59 | 8 | } |
60 | 237k | |
61 | 237k | if(status == UTF8_ACCEPT) { |
62 | 237k | (*output)[output_offset] = codepoint; |
63 | 237k | |
64 | 237k | if(output_widths) { |
65 | 236k | output_widths[output_offset] = codepoint_width + 1; |
66 | 236k | } |
67 | 237k | |
68 | 237k | codepoint_width = 0; |
69 | 237k | codepoint = 0; |
70 | 237k | output_offset += 1; |
71 | 237k | } else { |
72 | 23 | codepoint_width += 1; |
73 | 23 | } |
74 | 237k | |
75 | 237k | input_offset += 1; |
76 | 237k | } |
77 | 20.7k | |
78 | 20.7k | *output += output_offset; |
79 | 20.7k | *input += input_offset; |
80 | 20.7k | |
81 | 20.7k | if(input_offset != input_len) { |
82 | 2 | return PASSGEN_UTF8_OUTPUT_SIZE; |
83 | 2 | } |
84 | 20.7k | |
85 | 20.7k | return PASSGEN_UTF8_SUCCESS; |
86 | 20.7k | } |
87 | | |
88 | | /// Encode a code point using UTF-8 |
89 | | /// |
90 | | /// @author OndÃ
Âej HruÃ
¡ka <ondra@ondrovo.com> |
91 | | /// @license MIT |
92 | | /// |
93 | | /// @param out - output buffer (min 4 characters), will be 0-terminated |
94 | | /// @param utf - code point 0-0x10FFFF |
95 | | /// @return number of bytes on success, 0 on failure (also produces U+FFFD, |
96 | | /// which uses 3 bytes) |
97 | 286 | int passgen_utf8_encode_codepoint(uint8_t *out, uint32_t utf) { |
98 | 286 | if(utf <= 0x7F) { |
99 | 283 | // Plain ASCII |
100 | 283 | out[0] = (uint8_t) utf; |
101 | 283 | return 1; |
102 | 283 | } |
103 | 3 | |
104 | 3 | if(utf <= 0x07FF) { |
105 | 2 | // 2-byte unicode |
106 | 2 | out[0] = (uint8_t) (((utf >> 6) & 0x1F) | 0xC0); |
107 | 2 | out[1] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80); |
108 | 2 | return 2; |
109 | 2 | } |
110 | 1 | |
111 | 1 | if(utf <= 0xFFFF) { |
112 | 0 | // 3-byte unicode |
113 | 0 | out[0] = (uint8_t) (((utf >> 12) & 0x0F) | 0xE0); |
114 | 0 | out[1] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80); |
115 | 0 | out[2] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80); |
116 | 0 | return 3; |
117 | 0 | } |
118 | 1 | |
119 | 1 | if(utf <= 0x10FFFF) { |
120 | 1 | // 4-byte unicode |
121 | 1 | out[0] = (uint8_t) (((utf >> 18) & 0x07) | 0xF0); |
122 | 1 | out[1] = (uint8_t) (((utf >> 12) & 0x3F) | 0x80); |
123 | 1 | out[2] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80); |
124 | 1 | out[3] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80); |
125 | 1 | return 4; |
126 | 1 | } |
127 | 0 | |
128 | 0 | // error |
129 | 0 | return PASSGEN_UTF8_INVALID_CHAR; |
130 | 0 | } |
131 | | |
132 | | int passgen_utf8_encode( |
133 | | uint8_t *out, |
134 | | size_t out_len, |
135 | | size_t *out_pos, |
136 | | const uint32_t *in, |
137 | | size_t in_len, |
138 | 1 | size_t *in_pos) { |
139 | 5 | while((*in_pos < in_len) && (*out_pos < (out_len - 4))4 ) { |
140 | 4 | int ret = passgen_utf8_encode_codepoint(&out[*out_pos], in[*in_pos]); |
141 | 4 | if(ret < 0) { |
142 | 0 | return ret; |
143 | 0 | } |
144 | 4 | |
145 | 4 | *in_pos += 1; |
146 | 4 | *out_pos += ret; |
147 | 4 | } |
148 | 1 | |
149 | 1 | if(*out_pos < out_len) { |
150 | 1 | out[*out_pos] = 0; |
151 | 1 | } |
152 | 1 | |
153 | 1 | if(*in_pos != in_len) { |
154 | 0 | return PASSGEN_UTF8_OUTPUT_SIZE; |
155 | 0 | } |
156 | 1 | |
157 | 1 | return PASSGEN_UTF8_SUCCESS; |
158 | 1 | } |
159 | | |
160 | 8 | const char *passgen_utf8_error(int retval) { |
161 | 8 | switch(retval) { |
162 | 8 | case PASSGEN_UTF8_INVALID_CHAR: |
163 | 8 | return "invalid character"; |
164 | 0 | case PASSGEN_UTF8_SUCCESS: |
165 | 0 | return "success"; |
166 | 0 | case PASSGEN_UTF8_OUTPUT_SIZE: |
167 | 0 | return "output size too small"; |
168 | 0 | default: |
169 | 0 | return "unknown"; |
170 | 8 | } |
171 | 8 | } |