Coverage Report

Created: 2024-05-03 06:05

/builds/xfbs/passgen/include/passgen/parser/token.h
Line
Count
Source
1
/// @file token.h
2
/// @author Patrick M. Elsen
3
/// @brief Functions to parse Unicode codepoints into @ref passgen_token values.
4
///
5
/// The input to Passgen at this point is a sequence of unicode codepoints.
6
/// These are converted into tokens. For most codepoints, every codepoint maps
7
/// one-to-one to a token, but there are some special cases such as unicode
8
/// escape sequences (for example `\u{20}` will get you a space).
9
///
10
/// Tokens also handle escaping characters. For example, to write a closing
11
/// bracket in a character range, you can escape it with a backslash (such as
12
/// `[a-z&=\]]`).
13
#pragma once
14
#include <stdbool.h>
15
#include <stddef.h>
16
#include <stdint.h>
17
18
/// Bit that is ORed together with a parsed Unicode codepoint to indicate that
19
/// it was preceded by a backslash character.
20
1.47M
#define PASSGEN_TOKEN_ESCAPED_BIT  (1 << 30)
21
22
/// Bitmask for extracting only the Unicode codepoint from the parsed @ref
23
/// passgen_token.
24
#define PASSGEN_TOKEN_UNICODE_MASK ((1 << 21) - 1) // low 21 bits (0x1FFFFF); parenthesized because `-` binds tighter than `<<`
25
26
/// State of the @ref passgen_token_parser.
27
///
28
/// Initial state is @ref PASSGEN_TOKEN_INIT, which has the integer value zero.
29
/// The parser returns to this state after successfully parsing a token.
30
/// Positive integer values mean that the parser is in the middle of parsing a
31
/// multi-codepoint token.  Negative values indicate errors, once the parser is
32
/// in an error state it must be reset.
33
enum passgen_token_state {
34
    /// Initial state.
35
    PASSGEN_TOKEN_INIT = 0,
36
    /// Has parsed a backslash (`\`) and is currently parsing an escape sequence.
37
    PASSGEN_TOKEN_ESCAPED,
38
    /// Currently parsing a unicode escape sequence (`\u`).
39
    PASSGEN_TOKEN_UNICODE,
40
    /// Currently parsing the payload of a unicode escape sequence.
41
    PASSGEN_TOKEN_UNICODE_PAYLOAD,
42
    /// Error while parsing unicode escape sequence.
43
    PASSGEN_TOKEN_ERROR_UNICODE_START = -1,
44
    /// Error while parsing unicode escape sequence payload.
45
    PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD = -2,
46
    /// While parsing a unicode escape sequence, encountered a payload that was too large.
47
    PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD_LEN = -3
48
};
49
50
/// Parser state for tokenizer.
51
///
52
/// The parser keeps track of its current state (@ref state). It keeps track
53
/// of how many codepoints (@ref offset) and bytes (@ref byte_offset) have been
54
/// parsed so far, and writes this information into the parsed tokens. This
55
/// makes it possible later to generate useful error messages referencing the
56
/// exact location in the input that caused the error.  The parser also keeps
57
/// some state (@ref unicode_payload) for parsing unicode literals. These allow
58
/// writing Unicode characters in ASCII by specifying them in hexadecimal
59
/// notation. For example, a space could be written as `\u{20}`. To make this
60
/// possible, the parser needs to keep track of the length of the payload (the
61
/// hexadecimal data), since that has an upper bound, and the current parsed
62
/// codepoint value.
63
typedef struct passgen_token_parser {
64
    /// Current state of the parser.
65
    enum passgen_token_state state;
66
    /// Current codepoint offset in parsing.
67
    size_t offset;
68
    /// Current byte offset.
69
    size_t byte_offset;
70
    union {
71
        struct {
72
            /// Length of the payload parsed so far in Unicode escape sequence.
73
            size_t length;
74
            /// Current parsed codepoint.
75
            uint32_t codepoint;
76
        } unicode_payload;
77
    } data;
78
} passgen_token_parser;
79
80
/// Parsed passgen token.
81
///
82
/// Contains the parsed Unicode @ref codepoint. If it was preceded by a
83
/// backslash character, it will be binary ORed with @ref
84
/// PASSGEN_TOKEN_ESCAPED_BIT. In that case, to extract only the codepoint, you
85
/// can mask it with @ref PASSGEN_TOKEN_UNICODE_MASK: `token.codepoint &
86
/// PASSGEN_TOKEN_UNICODE_MASK`.
87
///
88
/// Also contains the byte offset of this token in the input string (useful if
89
/// the input is UTF-8 encoded) and the codepoint offset of this token.
90
typedef struct passgen_token {
91
    /// Codepoint
92
    uint32_t codepoint;
93
    /// Offset of this token (in codepoints)
94
    size_t offset;
95
    /// Offset of this token (in bytes)
96
    size_t byte_offset;
97
} passgen_token;
98
99
/// Initialize a @ref passgen_token_parser.
100
///
101
/// @memberof passgen_token_parser
102
void passgen_token_parser_init(struct passgen_token_parser *token_parser);
103
104
/// Convert a @ref passgen_token_state into a string.
105
///
106
/// @memberof passgen_token_state
107
const char *passgen_token_state_string(enum passgen_token_state state);
108
109
/// Parse a single codepoint.
110
///
111
/// This function parses a single codepoint into a `passgen_token`. It needs an
112
/// initialized parser to do so, and will return a value indicating if the
113
/// parsing was successful or not.  It can also be passed a width, which
114
/// indicates the byte width of the codepoint (if it was decoded from a UTF-8
115
/// string) which is used to keep track of the exact byte offset for every
116
/// token to make diagnostic error reporting easier.
117
///
118
/// @param parser An initialized `passgen_token_parser`. Use `passgen_token_parser_init` to
119
///   initialize it if you are not sure.
120
/// @param token The token that is parsed, this is an output.
121
/// @param width The width (in bytes) of the codepoint. This can be set to 1 if the original
122
///   input was Unicode codepoints.
123
/// @param codepoint The codepoint to parse.
124
/// @return The status of the parsing. This is a @ref passgen_token_state enum value.
125
/// @memberof passgen_token_parser
126
int passgen_token_parse(
127
    struct passgen_token_parser *parser,
128
    struct passgen_token *token,
129
    uint8_t width,
130
    uint32_t codepoint);
131
132
/// Parse a bunch of codepoints from an array. The variable size should contain
133
/// the number of codepoints and the number of slots in the token array (should
134
/// be the same).
135
///
136
/// It reads `size` amount of codepoints from the codepoints array, and writes
137
/// up to `size` amount of tokens into the tokens array.
138
///
139
/// If the return value is zero or positive, it was a success, and `size` is
140
/// filled with the amount of tokens in the tokens array.
141
///
142
/// @memberof passgen_token_parser
143
int passgen_token_parse_str(
144
    struct passgen_token_parser *parser,
145
    size_t *size,
146
    struct passgen_token *token,
147
    uint32_t *codepoints);
148
149
/// Get error string for parse error
150
///
151
/// @memberof passgen_token_parser
152
const char *passgen_token_parse_error_str(int ret);