/builds/xfbs/passgen/include/passgen/util/utf8.h
Line | Count | Source |
1 | | /// @file utf8.h |
2 | | /// @author Patrick M. Elsen <pelsen@xfbs.net> |
3 | | /// @brief UTF-8 encoding and decoding utilities. |
4 | | /// |
5 | | /// Internally, passgen has proper Unicode support (meaning that it works with UTF-32 |
6 | | /// codepoints). However, for transport and storage, UTF-8 is more common. Therefore, |
7 | | /// these functions are needed to convert from UTF-8 to Unicode codepoints, and vice |
8 | | /// versa. |
9 | | #pragma once |
10 | | #include <stddef.h> |
11 | | #include <stdint.h> |
12 | | |
13 | 2 | #define PASSGEN_UTF8_OUTPUT_SIZE 1 |
14 | 54.5k | #define PASSGEN_UTF8_SUCCESS 0 |
15 | 16 | #define PASSGEN_UTF8_INVALID_CHAR -1 |
16 | | |
17 | 160k | #define PASSGEN_UNICODE_MAX 0x10FFFF |
18 | | |
19 | | /// Decodes a UTF-8 character sequence into an array of Unicode codepoints. |
20 | | /// |
21 | | /// Decodes the UTF-8 character sequence in `input` (with length `input_len`) into |
22 | | /// the Unicode codepoint array `output` (with length `output_len`). If |
23 | | /// `output_widths` is supplied, then for every decoded codepoint, the byte width |
24 | | /// of the codepoint is written into it. |
25 | | /// |
26 | | /// NOTE(review): `output` and `input` are passed as pointers to pointers, and there |
27 | | /// are no separate position parameters, so the amounts written and consumed are |
28 | | /// presumably reported by advancing `*output` and `*input` -- TODO confirm against |
29 | | /// the implementation. |
30 | | /// |
31 | | /// @param output Address of a pointer into the decoded Unicode output, which must |
32 | | /// be a `uint32_t` array of size `output_len`. |
33 | | /// @param output_len The length of the output array and, if it is supplied, of |
34 | | /// `output_widths`. |
35 | | /// @param output_widths A pointer to an array of `uint8_t`, which will contain |
36 | | /// the byte width of every decoded Unicode codepoint. If a `NULL` pointer is |
37 | | /// passed, this behaviour is disabled. |
38 | | /// @param input Address of a pointer into the UTF-8 input. The input bytes are |
39 | | /// `const`-qualified. |
40 | | /// @param input_len The length of the UTF-8 input sequence. |
41 | | /// |
42 | | /// @return 0 on success, a positive integer when the output is too small to fit the |
43 | | /// decoded codepoints, and a negative integer on error. A negative value can be |
44 | | /// passed to passgen_utf8_error() to obtain a description of the error. |
45 | | int passgen_utf8_decode( |
46 | | uint32_t **output, |
47 | | size_t output_len, |
48 | | uint8_t *output_widths, |
49 | | const uint8_t **input, |
50 | | size_t input_len); |
51 | | |
52 | | /// Encodes Unicode codepoints from `in` into a UTF-8 byte sequence in `out`. |
53 | | int passgen_utf8_encode( |
54 | | uint8_t *out, |
55 | | size_t out_len, |
56 | | size_t *out_pos, |
57 | | const uint32_t *in, |
58 | | size_t in_len, |
59 | | size_t *in_pos); |
60 | | |
61 | | /// Encodes a single Unicode codepoint as UTF-8 into `out`. |
62 | | int passgen_utf8_encode_codepoint(uint8_t *out, uint32_t codepoint); |
63 | | |
64 | | /// Returns a string describing the error code `retval` (a negative return value). |
65 | | const char *passgen_utf8_error(int retval); |