/builds/xfbs/passgen/include/passgen/parser/token.h

Source
/// @file token.h
/// @author Patrick M. Elsen
/// @brief Functions to parse Unicode codepoints into @ref passgen_token values.
///
/// The input to Passgen at this point is a sequence of Unicode codepoints.
/// These are converted into tokens. Most codepoints map one-to-one to a
/// token, but there are some special cases such as Unicode escape sequences
/// (for example, `\u{20}` will get you a space).
///
/// Tokens also handle escaping characters. For example, to write a closing
/// bracket in a character range, you can escape it with a backslash (such as
/// `[a-z&=\]]`).
#pragma once
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/// Bit that is ORed together with a parsed Unicode codepoint to indicate that
/// it was preceded by a backslash character.
#define PASSGEN_TOKEN_ESCAPED_BIT  (1 << 30)

/// Bitmask for extracting only the Unicode codepoint from the parsed @ref
/// passgen_token.
///
/// Covers the low 21 bits (`0x1FFFFF`), which is enough to hold every Unicode
/// codepoint (the largest one is `0x10FFFF`) and does not overlap with @ref
/// PASSGEN_TOKEN_ESCAPED_BIT. The inner parentheses are required: `-` binds
/// tighter than `<<`, so `1 << 21 - 1` would evaluate to `1 << 20`.
#define PASSGEN_TOKEN_UNICODE_MASK ((1 << 21) - 1)

/// Tokenizer state, as tracked by a @ref passgen_token_parser.
///
/// The sign of the value tells you which kind of state it is:
///
/// - Zero (@ref PASSGEN_TOKEN_INIT) is the initial state, and also the state
///   the parser returns to after successfully parsing a token.
/// - Positive values mean that the parser is in the middle of parsing a
///   multi-codepoint token.
/// - Negative values are errors. Once the parser is in an error state, it
///   must be reset.
enum passgen_token_state {
    /// Initial state.
    PASSGEN_TOKEN_INIT = 0,
    /// A backslash (`\`) has been parsed; now parsing an escape sequence.
    PASSGEN_TOKEN_ESCAPED = 1,
    /// Inside a unicode escape sequence (`\u`).
    PASSGEN_TOKEN_UNICODE = 2,
    /// Inside the payload of a unicode escape sequence.
    PASSGEN_TOKEN_UNICODE_PAYLOAD = 3,
    /// Error while parsing unicode escape sequence.
    PASSGEN_TOKEN_ERROR_UNICODE_START = -1,
    /// Error while parsing unicode escape sequence payload.
    PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD = -2,
    /// The payload of a unicode escape sequence was too large.
    PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD_LEN = -3
};

/// Parser state for the tokenizer.
///
/// The parser keeps track of its current state (@ref state). It also counts
/// how many codepoints (@ref offset) and bytes (@ref byte_offset) have been
/// parsed so far, and writes this information into the parsed tokens. This
/// makes it possible to later generate useful error messages that reference
/// the exact location in the input that caused the error.
///
/// In addition, the parser keeps some state (@ref unicode_payload) for
/// parsing unicode literals. These allow writing Unicode characters in ASCII
/// by specifying them in hexadecimal notation. For example, a space could be
/// written as `\u{20}`. To make this possible, the parser needs to keep track
/// of the length of the payload (the hexadecimal data), since that has an
/// upper bound, and of the codepoint value parsed so far.
struct passgen_token_parser {
    /// State the parser is currently in.
    enum passgen_token_state state;
    /// Codepoint offset the parser is currently at.
    size_t offset;
    /// Byte offset the parser is currently at.
    size_t byte_offset;
    /// Data that is only meaningful while parsing a multi-codepoint token.
    union {
        /// Bookkeeping for a Unicode escape sequence.
        struct {
            /// Length of the payload parsed so far.
            size_t length;
            /// Codepoint value parsed so far.
            uint32_t codepoint;
        } unicode_payload;
    } data;
};

/// Shorthand for `struct passgen_token_parser`.
typedef struct passgen_token_parser passgen_token_parser;

/// A single token produced by the tokenizer.
///
/// Holds the parsed Unicode @ref codepoint. If the codepoint was preceded by
/// a backslash character, it is binary ORed with @ref
/// PASSGEN_TOKEN_ESCAPED_BIT. In that case, mask it with @ref
/// PASSGEN_TOKEN_UNICODE_MASK to extract only the codepoint:
/// `token.codepoint & PASSGEN_TOKEN_UNICODE_MASK`.
///
/// The token also records where it was found in the input: its offset in
/// codepoints, and its offset in bytes (useful if the input is UTF-8
/// encoded).
struct passgen_token {
    /// Parsed codepoint, possibly ORed with the escaped bit.
    uint32_t codepoint;
    /// Position of this token in the input, counted in codepoints.
    size_t offset;
    /// Position of this token in the input, counted in bytes.
    size_t byte_offset;
};

/// Shorthand for `struct passgen_token`.
typedef struct passgen_token passgen_token;

/// Initialize a @ref passgen_token_parser.
///
/// Call this before handing the parser to any of the parsing functions.
/// NOTE(review): presumably this puts the parser into @ref PASSGEN_TOKEN_INIT
/// and resets both offsets to zero -- confirm against the implementation.
///
/// @param token_parser The parser to initialize.
/// @memberof passgen_token_parser
void passgen_token_parser_init(struct passgen_token_parser *token_parser);

/// Convert a @ref passgen_token_state into a string.
///
/// @param state The state to convert.
/// @return A string for `state`. NOTE(review): ownership of the returned
///   string and the result for values outside the enum are not visible from
///   this header -- confirm against the implementation.
/// @memberof passgen_token_state
const char *passgen_token_state_string(enum passgen_token_state state);

/// Parse a single codepoint.
///
/// This function parses a single codepoint into a `passgen_token`. It needs an
/// initialized parser to do so, and will return a value indicating if the
/// parsing was successful or not.  It can also be passed a width, which
/// indicates the byte width of the codepoint (if it was decoded from a UTF-8
/// string) which is used to keep track of the exact byte offset for every
/// token to make diagnostic error reporting easier.
///
/// @param parser An initialized `passgen_token_parser`. Use `passgen_token_parser_init` to
///   initialize it if you are not sure.
/// @param token The token that is parsed, this is an output.
/// @param width The width (in bytes) of the codepoint. This can be set to 1 if the original
///   input was Unicode codepoints.
/// @param codepoint The codepoint to parse.
/// @return The status of the parsing. This is an @ref passgen_token_state enum value: zero
///   (@ref PASSGEN_TOKEN_INIT) after a token was parsed successfully, positive while in the
///   middle of a multi-codepoint token, negative on error. NOTE(review): presumably `token`
///   is only filled in when zero is returned -- confirm against the implementation.
/// @memberof passgen_token_parser
int passgen_token_parse(
    struct passgen_token_parser *parser,
    struct passgen_token *token,
    uint8_t width,
    uint32_t codepoint);

/// Parse an array of codepoints into an array of tokens.
///
/// Reads `*size` codepoints from the `codepoints` array, and writes up to
/// `*size` tokens into the `token` array. The number of codepoints and the
/// number of slots in the token array should therefore be the same.
///
/// If the return value is zero or positive, it was a success, and `*size` is
/// filled with the number of tokens in the token array.
///
/// @param parser The parser to use. NOTE(review): presumably this must be
///   initialized, as for `passgen_token_parse` -- confirm.
/// @param size In/out. On input, the number of codepoints in `codepoints` and
///   the number of slots in `token`. On success, the number of tokens written.
/// @param token Output array of tokens.
/// @param codepoints Input array of codepoints.
/// @return Zero or positive on success. NOTE(review): presumably a negative
///   @ref passgen_token_state error value otherwise -- confirm.
/// @memberof passgen_token_parser
int passgen_token_parse_str(
    struct passgen_token_parser *parser,
    size_t *size,
    struct passgen_token *token,
    uint32_t *codepoints);

/// Get the error string for a parse error.
///
/// @param ret The parse result to describe. NOTE(review): presumably the
///   return value of `passgen_token_parse` or `passgen_token_parse_str` --
///   confirm against the implementation.
/// @return A string describing the error. NOTE(review): ownership of the
///   returned string and the result for non-error values are not visible
///   from this header -- confirm.
/// @memberof passgen_token_parser
const char *passgen_token_parse_error_str(int ret);

Coverage Report

Created: 2024-05-03 06:05

Line	Count	Source
1		/// @file token.h
2		/// @author Patrick M. Elsen
3		/// @brief Functions to parse Unicode codepoints into @ref passgen_token values.
4		///
5		/// The input to Passgen at this point is a sequence of unicode codepoints.
6		/// These are converted into tokens. For most codepoints, every codepoint maps
7		/// one-to-one to a token, but there are some special cases such as unicode
8		/// escape sequences (for example `\u{0a}` will get you a space).
9		///
10		/// Tokens also handle escaping characters. For example, to write a closing
11		/// bracket in a character range, you can escape it with a backslash (such as
12		/// `[a-z&=\]]`).
13		#pragma once
14		#include <stdbool.h>
15		#include <stddef.h>
16		#include <stdint.h>
17
18		/// Bit that is ORed together with a parsed Unicode codepoint to indicate that
19		/// it was preceded by a backslash character.
20	1.47M	#define PASSGEN_TOKEN_ESCAPED_BIT (1 << 30)
21
22		/// Bitmask for extracting only the Unicode codepoint from the parsed @ref
23		/// passgen_token.
24		#define PASSGEN_TOKEN_UNICODE_MASK (1 << 21 - 1)
25
26		/// State of the @ref passgen_token_parser.
27		///
28		/// Initial state is @ref PASSGEN_TOKEN_INIT, which has the integer value zero.
29		/// The parser returns to this state after successfully parsing a token.
30		/// Positive integer values mean that the parser is in the middle of parsing a
31		/// multi-codepoint token. Negative values indicate errors, once the parser is
32		/// in an error state it must be reset.
33		enum passgen_token_state {
34		/// Initial state.
35		PASSGEN_TOKEN_INIT = 0,
36		/// Has parsed a backslash (`\`) and is currently parsing an escape sequence.
37		PASSGEN_TOKEN_ESCAPED,
38		/// Currently parsing a unicode escape sequence (`\u`).
39		PASSGEN_TOKEN_UNICODE,
40		/// Currently parsing the payload of a unicode escape sequence.
41		PASSGEN_TOKEN_UNICODE_PAYLOAD,
42		/// Error while parsing unicode escape sequence.
43		PASSGEN_TOKEN_ERROR_UNICODE_START = -1,
44		/// Error while parsing unicode escape sequence payload.
45		PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD = -2,
46		/// While parsing a unicode escape sequence, encountered a payload that was too large.
47		PASSGEN_TOKEN_ERROR_UNICODE_PAYLOAD_LEN = -3
48		};
49
50		/// Parser state for tokenizer.
51		///
52		/// The parser keeps track of it's current state (@ref state). It keeps track
53		/// of how many codepoints (@ref offset) and bytes (@ref byte_offset) have been
54		/// parsed so far, and writes this information into the parsed tokens. This
55		/// makes is possible later to generate useful error messages referencing the
56		/// exact location in the input that caused the error. The parser also keeps
57		/// some state (@ref unicode_payload) for parsing unicode literals. These allow
58		/// writing Unicode characters in ASCII by specifying them in hexadecimal
59		/// notation. For example, a space could be written as `\x{0a}`. To make this
60		/// possible, the parser needs to keep track of the length of the payload (the
61		/// hexadecimal data), since that has an upper bound, and the current parsed
62		/// codepoint value.
63		typedef struct passgen_token_parser {
64		/// Current state of the parser.
65		enum passgen_token_state state;
66		/// Current codepoint offset in parsing.
67		size_t offset;
68		/// Current byte offset.
69		size_t byte_offset;
70		union {
71		struct {
72		/// Length of the payload parsed so far in Unicode escape sequence.
73		size_t length;
74		/// Current parsed codepoint.
75		uint32_t codepoint;
76		} unicode_payload;
77		} data;
78		} passgen_token_parser;
79
80		/// Parsed passgen token.
81		///
82		/// Contains the parsed Unicode @ref codepoint. If it was preceded by a
83		/// backslash character, it will be binary ORed with @ref
84		/// PASSGEN_TOKEN_ESCAPED_BIT. In that case, to extract only the codepoint, you
85		/// can mask it with @ref PASSGEN_TOKEN_UNICODE_MASK: `token.codepoint &
86		/// PASSGEN_TOKEN_UNICODE_MASK`.
87		///
88		/// Also contains the byte offset of this token in the input string (useful if
89		/// the input is UTF-8 encoded) and the codepoint offset of this token.
90		typedef struct passgen_token {
91		/// Codepoint
92		uint32_t codepoint;
93		/// Offset of this token (in codepoints)
94		size_t offset;
95		/// Offset of this token (in bytes)
96		size_t byte_offset;
97		} passgen_token;
98
99		/// Initialize a @ref passgen_token_parser.
100		///
101		/// @memberof passgen_token_parser
102		void passgen_token_parser_init(struct passgen_token_parser *token_parser);
103
104		/// Convert a @ref passgen_token_state into a string.
105		///
106		/// @memberof passgen_token_state
107		const char *passgen_token_state_string(enum passgen_token_state state);
108
109		/// Parse a single codepoint.
110		///
111		/// This function parses a single codepoint into a `passgen_token`. It needs an
112		/// initialized parser to do so, and will return a value indicating if the
113		/// parsing was successful or not. It can also be passed a width, which
114		/// indicates the byte width of the codepoint (if it was decoded from a UTF-8
115		/// string) which is used to keep track of the exact byte offset for every
116		/// token to make diagnostic error reporting easier.
117		///
118		/// @param parser An initialized `passgen_token_parser`. Use `passgen_token_parser_init` to
119		/// initialize it if you are not sure.
120		/// @param token The token that is parsed, this is an output.
121		/// @param width The width (in bytes) of the codepoint. This can be set to 1 if the original
122		/// input was Unicode codepoints.
123		/// @param codepoint The codepoint to parse.
124		/// @return The status of the parsing. This is an @ref passgen_token_state enum value.
125		/// @memberof passgen_token_parser
126		int passgen_token_parse(
127		struct passgen_token_parser *parser,
128		struct passgen_token *token,
129		uint8_t width,
130		uint32_t codepoint);
131
132		/// Parse a bunch of codepoints from an array. The variable size should contain
133		/// the number of codepoints and the number of slots in the token array (should
134		/// be the same).
135		///
136		/// It reads `size` amount of codepoints from the codepoints array, and writes
137		/// up to `size` amount of tokens into the tokens array.
138		///
139		/// If the return value is zero or positive, it was a success, and `size` is
140		/// filled with the amount of tokens in the tokens array.
141		///
142		/// @memberof passgen_token_parser
143		int passgen_token_parse_str(
144		struct passgen_token_parser *parser,
145		size_t *size,
146		struct passgen_token *token,
147		uint32_t *codepoints);
148
149		/// Get error string for parse error
150		///
151		/// @memberof passgen_token_parser
152		const char *passgen_token_parse_error_str(int ret);