/builds/xfbs/passgen/include/passgen/parser/token.h
Line | Count | Source |
1 | | /// @file token.h |
2 | | /// @author Patrick M. Elsen |
3 | | /// @brief Functions to parse Unicode codepoints into @ref passgen_token values. |
4 | | /// |
5 | | /// The input to Passgen at this point is a sequence of unicode codepoints. |
6 | | /// These are converted into tokens. Most codepoints map one-to-one to a |
7 | | /// token, but there are some special cases such as unicode escape sequences |
8 | | /// (for example `\u{0a}` will get you a line feed, U+000A). |
9 | | /// |
10 | | /// Tokens also handle escaping characters. For example, to write a closing |
11 | | /// bracket in a character range, you can escape it with a backslash (such as |
12 | | /// `[a-z&=\]]`). |
13 | | #pragma once |
14 | | #include <stdbool.h> |
15 | | #include <stddef.h> |
16 | | #include <stdint.h> |
17 | | |
18 | | /// Flag bit (bit 30) that is ORed into a parsed Unicode codepoint to indicate |
19 | | /// that the codepoint was preceded by a backslash character. |
20 | 1.47M | #define PASSGEN_TOKEN_ESCAPED_BIT (1 << 30) |
21 | | |
/// Bitmask for extracting only the Unicode codepoint from the parsed @ref
/// passgen_token.
///
/// Covers the low 21 bits (0x1FFFFF), which is enough for every codepoint up
/// to U+10FFFF. The shift is parenthesized on purpose: binary `-` binds
/// tighter than `<<`, so `1 << 21 - 1` evaluates to `1 << 20` (a single bit)
/// rather than a 21-bit mask.
#define PASSGEN_TOKEN_UNICODE_MASK ((1 << 21) - 1)
25 | | |
26 | | /// State of the @ref passgen_token_parser. |
27 | | /// |
28 | | /// The initial state is @ref PASSGEN_TOKEN_INIT, which has the integer value |
29 | | /// zero; the parser returns to this state after successfully parsing a token. |
30 | | /// Positive integer values mean that the parser is in the middle of parsing a |
31 | | /// multi-codepoint token. Negative values indicate errors; once the parser is |
32 | | /// in an error state it must be reset. |
33 | | enum passgen_token_state { |
34 | | /// Initial state (value 0). |
35 | | PASSGEN_TOKEN_INIT = 0, |
36 | | /// Has parsed a backslash (`\`) and is currently parsing an escape sequence (implicit value 1). |
37 | | PASSGEN_TOKEN_ESCAPED, |
38 | | /// Currently parsing a unicode escape sequence (`\u`) (implicit value 2). |
39 | | PASSGEN_TOKEN_UNICODE, |
40 | | /// Currently parsing the payload of a unicode escape sequence (implicit value 3). |
41 | | PASSGEN_TOKEN_UNICODE_PAYLOAD, |
42 | | /// Error while parsing unicode escape sequence. |
43 | | PASSGEN_TOKEN_ERROR_UNICODE_START = -1, |
44 | | /// Error while parsing unicode escape sequence payload. |
45 | | PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD = -2, |
46 | | /// While parsing a unicode escape sequence, encountered a payload that was too large. |
47 | | PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD_LEN = -3 |
48 | | }; |
49 | | |
50 | | /// Parser state for tokenizer. |
51 | | /// |
52 | | /// The parser keeps track of its current state (@ref state). It keeps track |
53 | | /// of how many codepoints (@ref offset) and bytes (@ref byte_offset) have been |
54 | | /// parsed so far, and writes this information into the parsed tokens. This |
55 | | /// makes it possible later to generate useful error messages referencing the |
56 | | /// exact location in the input that caused the error. The parser also keeps |
57 | | /// some state (@ref unicode_payload) for parsing unicode literals. These allow |
58 | | /// writing Unicode characters in ASCII by specifying them in hexadecimal |
59 | | /// notation. For example, a line feed could be written as `\u{0a}`. To make this |
60 | | /// possible, the parser needs to keep track of the length of the payload (the |
61 | | /// hexadecimal data), since that has an upper bound, and the current parsed |
62 | | /// codepoint value. |
63 | | typedef struct passgen_token_parser { |
64 | | /// Current state of the parser (see @ref passgen_token_state). |
65 | | enum passgen_token_state state; |
66 | | /// Number of codepoints parsed so far. |
67 | | size_t offset; |
68 | | /// Number of bytes parsed so far. |
69 | | size_t byte_offset; |
70 | | union { |
71 | | struct { |
72 | | /// Length of the payload parsed so far in the current Unicode escape sequence. |
73 | | size_t length; |
74 | | /// Codepoint value parsed so far from the payload. |
75 | | uint32_t codepoint; |
76 | | } unicode_payload; |
77 | | } data; |
78 | | } passgen_token_parser; |
79 | | |
80 | | /// Parsed passgen token. |
81 | | /// |
82 | | /// Contains the parsed Unicode @ref codepoint. If it was preceded by a |
83 | | /// backslash character, it will be bitwise ORed with @ref |
84 | | /// PASSGEN_TOKEN_ESCAPED_BIT. In that case, to extract only the codepoint, you |
85 | | /// can mask it with @ref PASSGEN_TOKEN_UNICODE_MASK: `token.codepoint & |
86 | | /// PASSGEN_TOKEN_UNICODE_MASK`. |
87 | | /// |
88 | | /// Also contains the byte offset of this token in the input string (useful if |
89 | | /// the input is UTF-8 encoded) and the codepoint offset of this token. |
90 | | typedef struct passgen_token { |
91 | | /// Codepoint, bitwise ORed with @ref PASSGEN_TOKEN_ESCAPED_BIT if it was escaped. |
92 | | uint32_t codepoint; |
93 | | /// Offset of this token in the input (in codepoints). |
94 | | size_t offset; |
95 | | /// Offset of this token in the input (in bytes). |
96 | | size_t byte_offset; |
97 | | } passgen_token; |
98 | | |
99 | | /// Initialize a @ref passgen_token_parser so that it can be passed to @ref passgen_token_parse. |
100 | | /// @param token_parser The parser to initialize. |
101 | | /// @memberof passgen_token_parser |
102 | | void passgen_token_parser_init(struct passgen_token_parser *token_parser); |
103 | | |
104 | | /// Convert a @ref passgen_token_state into a string. |
105 | | /// @param state The state to convert. |
106 | | /// @memberof passgen_token_state |
107 | | const char *passgen_token_state_string(enum passgen_token_state state); |
108 | | |
109 | | /// Parse a single codepoint. |
110 | | /// |
111 | | /// This function parses a single codepoint into a `passgen_token`. It needs an |
112 | | /// initialized parser to do so, and will return a value indicating whether the |
113 | | /// parsing was successful or not. It is also passed a width, which indicates |
114 | | /// the byte width of the codepoint (if it was decoded from a UTF-8 string); |
115 | | /// this is used to keep track of the exact byte offset for every token to make |
116 | | /// diagnostic error reporting easier. |
117 | | /// |
118 | | /// @param parser An initialized `passgen_token_parser`. Use `passgen_token_parser_init` to |
119 | | /// initialize it if you are not sure. |
120 | | /// @param token Output: the token that is parsed. |
121 | | /// @param width The width (in bytes) of the codepoint. This can be set to 1 if the original |
122 | | /// input was Unicode codepoints. |
123 | | /// @param codepoint The codepoint to parse. |
124 | | /// @return The status of the parsing, an @ref passgen_token_state enum value returned as `int` (zero: initial state, positive: mid-token, negative: error). |
125 | | /// @memberof passgen_token_parser |
126 | | int passgen_token_parse( |
127 | | struct passgen_token_parser *parser, |
128 | | struct passgen_token *token, |
129 | | uint8_t width, |
130 | | uint32_t codepoint); |
131 | | |
132 | | /// Parse a sequence of codepoints from an array. On input, `*size` should hold |
133 | | /// the number of codepoints and the number of slots in the token array (these |
134 | | /// should be the same). |
135 | | /// |
136 | | /// It reads `*size` codepoints from the `codepoints` array, and writes up to |
137 | | /// `*size` tokens into the `token` array. |
138 | | /// |
139 | | /// If the return value is zero or positive, it was a success, and `*size` is |
140 | | /// filled with the number of tokens in the `token` array. |
141 | | /// |
142 | | /// @memberof passgen_token_parser |
143 | | int passgen_token_parse_str( |
144 | | struct passgen_token_parser *parser, |
145 | | size_t *size, |
146 | | struct passgen_token *token, |
147 | | uint32_t *codepoints); |
148 | | |
149 | | /// Get the error string for a parse error. |
150 | | /// @param ret Parse status code (NOTE(review): presumably the value returned by passgen_token_parse; confirm against the implementation). |
151 | | /// @memberof passgen_token_parser |
152 | | const char *passgen_token_parse_error_str(int ret); |