Line data Source code
1 : #include "passgen/util/utf8.h"
2 :
3 : // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
4 : // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
5 :
// States of the decoder DFA. Any other state value means the decoder is in
// the middle of a multi-byte sequence.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

// clang-format off
// Layout: entries 0..255 map an input byte to its character class; the
// entries from 256 onwards are the transition table with 16 slots per state,
// indexed as 256 + state * 16 + class.
static const uint8_t utf8d[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
// clang-format on

/// Feed a single byte into the UTF-8 decoder state machine.
///
/// @param state - in/out DFA state; must be UTF8_ACCEPT before the first byte
///                of a sequence
/// @param codep - in/out accumulator for the code point; holds the decoded
///                value once the returned state is UTF8_ACCEPT
/// @param byte  - next input byte (0..0xFF); anything larger is rejected
/// @return the new state: UTF8_ACCEPT when a code point is complete,
///         UTF8_REJECT on malformed input, any other value when more bytes
///         are needed
static inline uint32_t
utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
    // The class section of the table only has 256 entries; a wider value
    // would index into the transition table or past the end of the array.
    if(byte > 0xFFu) {
        *state = UTF8_REJECT;
        return *state;
    }

    uint32_t type = utf8d[byte];

    if(*state == UTF8_ACCEPT) {
        // Lead byte: shifting the mask by the class strips the length
        // marker bits and leaves only the payload bits.
        *codep = (0xFFu >> type) & byte;
    } else {
        // Continuation byte: append its low six payload bits.
        *codep = (byte & 0x3Fu) | (*codep << 6);
    }

    *state = utf8d[256 + *state * 16 + type];
    return *state;
}
38 :
39 20792 : int passgen_utf8_decode(
40 : uint32_t **output,
41 : size_t output_len,
42 : uint8_t *output_widths,
43 : const uint8_t **input,
44 : size_t input_len) {
45 20792 : uint32_t state = UTF8_ACCEPT;
46 20792 : uint32_t codepoint = 0;
47 :
48 20792 : size_t input_offset = 0;
49 20792 : size_t output_offset = 0;
50 20792 : size_t codepoint_width = 0;
51 :
52 : // iterate while we still have input and we still have outputs.
53 259117 : while((input_offset < input_len) && (output_offset < output_len)) {
54 : uint32_t status =
55 238333 : utf8_decode(&state, &codepoint, (*input)[input_offset]);
56 :
57 238333 : if(status == UTF8_REJECT) {
58 8 : return PASSGEN_UTF8_INVALID_CHAR;
59 : }
60 :
61 238325 : if(status == UTF8_ACCEPT) {
62 238302 : (*output)[output_offset] = codepoint;
63 :
64 238302 : if(output_widths) {
65 236902 : output_widths[output_offset] = codepoint_width + 1;
66 : }
67 :
68 238302 : codepoint_width = 0;
69 238302 : codepoint = 0;
70 238302 : output_offset += 1;
71 : } else {
72 23 : codepoint_width += 1;
73 : }
74 :
75 238325 : input_offset += 1;
76 : }
77 :
78 20784 : *output += output_offset;
79 20784 : *input += input_offset;
80 :
81 20784 : if(input_offset != input_len) {
82 2 : return PASSGEN_UTF8_OUTPUT_SIZE;
83 : }
84 :
85 20782 : return PASSGEN_UTF8_SUCCESS;
86 : }
87 :
88 : /// Encode a code point using UTF-8
89 : ///
90 : /// @author OndÃ
Âej HruÃ
¡ka <ondra@ondrovo.com>
91 : /// @license MIT
92 : ///
93 : /// @param out - output buffer (min 4 characters), will be 0-terminated
94 : /// @param utf - code point 0-0x10FFFF
95 : /// @return number of bytes on success, 0 on failure (also produces U+FFFD,
96 : /// which uses 3 bytes)
97 291 : int passgen_utf8_encode_codepoint(uint8_t *out, uint32_t utf) {
98 291 : if(utf <= 0x7F) {
99 : // Plain ASCII
100 288 : out[0] = (uint8_t) utf;
101 288 : return 1;
102 : }
103 :
104 3 : if(utf <= 0x07FF) {
105 : // 2-byte unicode
106 2 : out[0] = (uint8_t) (((utf >> 6) & 0x1F) | 0xC0);
107 2 : out[1] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
108 2 : return 2;
109 : }
110 :
111 1 : if(utf <= 0xFFFF) {
112 : // 3-byte unicode
113 0 : out[0] = (uint8_t) (((utf >> 12) & 0x0F) | 0xE0);
114 0 : out[1] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80);
115 0 : out[2] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
116 0 : return 3;
117 : }
118 :
119 1 : if(utf <= 0x10FFFF) {
120 : // 4-byte unicode
121 1 : out[0] = (uint8_t) (((utf >> 18) & 0x07) | 0xF0);
122 1 : out[1] = (uint8_t) (((utf >> 12) & 0x3F) | 0x80);
123 1 : out[2] = (uint8_t) (((utf >> 6) & 0x3F) | 0x80);
124 1 : out[3] = (uint8_t) (((utf >> 0) & 0x3F) | 0x80);
125 1 : return 4;
126 : }
127 :
128 : // error
129 0 : return PASSGEN_UTF8_INVALID_CHAR;
130 : }
131 :
132 1 : int passgen_utf8_encode(
133 : uint8_t *out,
134 : size_t out_len,
135 : size_t *out_pos,
136 : const uint32_t *in,
137 : size_t in_len,
138 : size_t *in_pos) {
139 5 : while((*in_pos < in_len) && (*out_pos < (out_len - 4))) {
140 4 : int ret = passgen_utf8_encode_codepoint(&out[*out_pos], in[*in_pos]);
141 4 : if(ret < 0) {
142 0 : return ret;
143 : }
144 :
145 4 : *in_pos += 1;
146 4 : *out_pos += ret;
147 : }
148 :
149 1 : if(*out_pos < out_len) {
150 1 : out[*out_pos] = 0;
151 : }
152 :
153 1 : if(*in_pos != in_len) {
154 0 : return PASSGEN_UTF8_OUTPUT_SIZE;
155 : }
156 :
157 1 : return PASSGEN_UTF8_SUCCESS;
158 : }
159 :
160 8 : const char *passgen_utf8_error(int retval) {
161 8 : switch(retval) {
162 8 : case PASSGEN_UTF8_INVALID_CHAR:
163 8 : return "invalid character";
164 0 : case PASSGEN_UTF8_SUCCESS:
165 0 : return "success";
166 0 : case PASSGEN_UTF8_OUTPUT_SIZE:
167 0 : return "output size too small";
168 0 : default:
169 0 : return "unknown";
170 : }
171 : }
|