diff --git a/norminette/__main__.py b/norminette/__main__.py index 2b6509e..264dcca 100644 --- a/norminette/__main__.py +++ b/norminette/__main__.py @@ -7,7 +7,7 @@ import argparse from norminette.errors import formatters from norminette.file import File -from norminette.lexer import Lexer, TokenError +from norminette.lexer import Lexer from norminette.exceptions import CParsingError from norminette.registry import Registry from norminette.context import Context @@ -127,10 +127,10 @@ def main(): for file in files: try: lexer = Lexer(file) - tokens = lexer.get_tokens() + tokens = list(lexer) context = Context(file, tokens, debug, args.R) registry.run(context) - except (TokenError, CParsingError) as e: + except CParsingError as e: print(file.path + f": Error!\n\t{colors(e.msg, 'red')}") sys.exit(1) except KeyboardInterrupt: diff --git a/norminette/lexer/__init__.py b/norminette/lexer/__init__.py index 1f219c5..1d14758 100644 --- a/norminette/lexer/__init__.py +++ b/norminette/lexer/__init__.py @@ -1,5 +1,4 @@ from norminette.lexer.lexer import Lexer -from norminette.lexer.lexer import TokenError from norminette.lexer.tokens import Token -__all__ = ["Lexer", "TokenError", "Token"] +__all__ = ["Lexer", "Token"] diff --git a/norminette/lexer/dictionary.py b/norminette/lexer/dictionary.py index 0aa57aa..a971e12 100644 --- a/norminette/lexer/dictionary.py +++ b/norminette/lexer/dictionary.py @@ -108,6 +108,7 @@ ">>": "RIGHT_SHIFT", "<<": "LEFT_SHIFT", "?": "TERN_CONDITION", + "#": "HASH", } brackets = { diff --git a/norminette/lexer/lexer.py b/norminette/lexer/lexer.py index d715abf..a817d27 100644 --- a/norminette/lexer/lexer.py +++ b/norminette/lexer/lexer.py @@ -1,3 +1,4 @@ +import re import string from typing import Optional, Tuple @@ -10,27 +11,83 @@ from norminette.file import File from norminette.errors import Error, Highlight as H + +def c(a: str, b: str): + a = a.lower() + b = b.lower() + return ( + a + b, a.upper() + b, a + b.upper(), a.upper() + b.upper(), + b + a, b.upper() + a, b + a.upper(), b.upper() + a.upper(), + ) + + octal_digits = "01234567" hexadecimal_digits = "0123456789abcdefABCDEF" +integer_suffixes = ( + '', + *"uUlLzZ", + "ll", "LL", + "wb", "WB", + "i64", "I64", + *c('u', 'l'), + *c('u', "ll"), + *c('u', 'z'), + *c('u', "wb"), + *c('u', "i64"), +) + +INT_LITERAL_PATTERN = re.compile(r""" +^ +# (?P[-+]*) +(?P # prefix can be + 0[bBxX]* # 0, 0b, 0B, 0x, 0X, 0bb, 0BB, ... + | # or empty +) +(?P + # BUG If prefix is followed by two or more x, it doesn't works correctly + (?<=0[xX]) # is prefix for hex digits? + [\da-fA-F]+ # so, collect hex digits + | # otherwise + \d+ # collect decimal digits +) +(?P + (?<=[eE]) # is constant ending with an `E`? 
+ [\w\d+\-.]* # so, collect `+` and `-` operators + | # otherwise + \w # collect suffixes that starts with an letter + [\w\d.]* # and letters, digits and dots that follows it + | # finally, do suffix be optional (empty) +) +""", re.VERBOSE) + +_float_pattern = r""" + ^ + (?P{0}) + (?P + (?: + [eE]+[-+]\d+ + |[eE]+\d+ + |(?:[eE][+-]?(?:[.\d]+)?)+ + ){1} + ) + (?P[\w\d._]*|) +""" + +FLOAT_EXPONENT_LITERAL_PATTERN = re.compile(_float_pattern.format(r"\d+", ''), re.VERBOSE) +FLOAT_FRACTIONAL_LITERAL_PATTERN = re.compile(_float_pattern.format(r"(?:\d+)?\.\d+|\d+\.", '?'), re.VERBOSE) class Lexer: def __init__(self, file: File): self.file = file - self.src = file.source - self.len = len(file.source) self.__pos = int(0) self.__line_pos = self.__line = 1 - self.tokens = [] - - def peek_sub_string(self, size): - return self.src[self.__pos : self.__pos + size] def raw_peek(self, *, offset: int = 0, collect: int = 1): assert collect > 0 and offset >= 0 - if (pos := self.__pos + offset) < self.len: - return ''.join(self.src[pos:pos+collect]) + if (pos := self.__pos + offset) < len(self.file.source): + return ''.join(self.file.source[pos:pos+collect]) return None def peek(self, *, offset: int = 0) -> Optional[Tuple[str, int]]: @@ -126,107 +183,115 @@ def pop( result += char return result - def peek_char(self): - """Return current character being checked, - if the character is a backslash character the following - character is appended to the return value. It will allow us to - parse escaped characters easier. - """ - char = None - if self.__pos < self.len: - char = self.src[self.__pos] - if self.src[self.__pos] == "\\": - char = self.src[self.__pos : self.__pos + 2] - return char - - def pop_char(self, skip_escaped=True): - """Pop a character that's been read by increasing self.__pos, - for escaped characters self.__pos will be increased twice - """ - if self.peek_char() == "\t": - self.__line_pos += 4 - (self.__line_pos - 1 & 3) - else: - self.__line_pos += len(self.peek_char()) - if self.__pos < self.len and skip_escaped and self.src[self.__pos] == "\\": - self.__pos += 1 - self.__pos += 1 - return self.peek_char() - - def peek_token(self): - return self.tokens[-1] - def line_pos(self): return self.__line, self.__line_pos - def is_string(self): - """True if current character could start a string constant""" - return self.raw_peek(collect=2) == 'L"' or self.raw_peek() == '"' - - def is_constant(self): - """True if current character could start a numeric constant""" - if self.peek_char() in string.digits: - return True - elif self.peek_char() == ".": - for i in range(0, self.len - self.__pos): - if self.src[self.__pos + i] == ".": - i += 1 - elif self.src[self.__pos + i] in "0123456789": - return True - else: - return False + def parse_char_literal(self) -> Optional[Token]: + if self.raw_peek(collect=2) != "L'" and self.raw_peek() != '\'': + return + pos = lineno, column = self.line_pos() + value = self.pop() + chars = 0 + if value == 'L': + value += self.pop() + for _ in range(100): + try: + char = self.pop(use_escape=True) + except UnexpectedEOF: + error = Error.from_name("UNEXPECTED_EOF_CHR", highlights=[ + H(lineno, column, length=len(value)), + ]) + self.file.errors.add(error) + break + if char == '\n': + error = Error.from_name("UNEXPECTED_EOL_CHR", highlights=[ + H(lineno, column, length=len(value)), + H(lineno, column + len(value), length=1, hint="Perhaps you forgot a single quote (')?") + ]) + self.file.errors.add(error) + break + value += char + if char == '\'': + break + chars += 1 else: - 
return False - - def is_char_constant(self): - """True if current character could start a character constant""" - return self.raw_peek() == "'" or self.raw_peek(collect=2) == "L'" - - def string(self): + raise MaybeInfiniteLoop() + if value == "''": + error = Error.from_name("EMPTY_CHAR", highlights=[H(*pos, length=2)]) + self.file.errors.add(error) + if chars > 1 and value.endswith('\''): + error = Error.from_name("CHAR_AS_STRING", highlights=[ + H(*pos, length=len(value)), + H(*pos, length=1, + hint="Perhaps you want a string (double quote, \") instead of a char (single quote, ')?"), + ]) + self.file.errors.add(error) + return Token("CHAR_CONST", pos, value=value) + + def parse_string_literal(self): """String constants can contain any characer except unescaped newlines. An unclosed string or unescaped newline is a fatal error and thus parsing will stop here. """ - pos = self.line_pos() - tkn_value = "" - if self.peek_char() == "L": - tkn_value += self.peek_char() - self.pop_char() - tkn_value += self.peek_char() - self.pop_char() - while self.peek_char() not in [None]: - tkn_value += self.peek_char() - if self.peek_sub_string(2) == "\\\n": - self.__line += 1 - self.__line_pos = 1 - if self.peek_char() == '"': + if not self.peek(): + return + if self.raw_peek() != '"' and self.raw_peek(collect=2) != "L\"": + return + pos = lineno, column = self.line_pos() + val = self.pop() + if val != '"': + val += self.pop() + while self.peek() is not None: + char = self.pop(use_escape=True) + val += char + if char == '"': break - if self.peek_char() == '\n': - raise TokenError(pos, f"String literal unterminated detected at line {pos[0]}") - self.pop_char() else: - raise TokenError(pos) + error = Error.from_name("UNEXPECTED_EOF_STR") + error.add_highlight(*pos, length=len(val)) + error.add_highlight(lineno, column + len(val), length=1, hint="Perhaps you forgot a double quote (\")?") + self.file.errors.add(error) + return Token("STRING", pos, val) + + def parse_integer_literal(self): + # TODO Add to support single quote (') to separate digits according to C23 + + match = INT_LITERAL_PATTERN.match(self.file.source[self.__pos:]) + if match is None: return - self.tokens.append(Token("STRING", pos, tkn_value)) - self.pop_char() - def char_constant(self): - """Char constants follow pretty much the same rule as string constants""" - pos = self.line_pos() - tkn_value = "'" - self.pop_char() - while self.peek_char(): - tkn_value += self.peek_char() - if self.peek_char() == "\n": - self.pop_char() - raise TokenError(pos) - if self.peek_char() == "'": - self.pop_char() - self.tokens.append(Token("CHAR_CONST", pos, tkn_value)) - return - self.pop_char() - raise TokenError(pos) - - def constant(self): + pos = lineno, column = self.line_pos() + token = Token("CONSTANT", pos, slice := self.pop(times=match.end())) + + if match["Suffix"] not in integer_suffixes: + suffix_length = len(match["Suffix"]) + string_length = len(slice) - suffix_length + if match["Suffix"][0] in "+-": + error = Error.from_name("MAXIMAL_MUNCH") + error.add_highlight(lineno, column + string_length, length=1, hint="Perhaps you forgot a space ( )?") + else: + error = Error.from_name("INVALID_SUFFIX") + error.add_highlight(lineno, column + string_length, length=suffix_length) + self.file.errors.add(error) + + def _check_bad_prefix(name: str, bucket: str): + error = Error.from_name(f"INVALID_{name}_INT") + for index, char in enumerate(match["Constant"], start=len(match["Prefix"])): + if char not in bucket: + error.add_highlight(lineno, column + 
index, length=1) + if error.highlights: + self.file.errors.add(error) + + if match["Prefix"] in ("0b", "0B"): + _check_bad_prefix("BIN", "01") + elif match["Prefix"] == '0': + _check_bad_prefix("OCT", "01234567") + elif match["Prefix"] in ("0x", "0X"): + _check_bad_prefix("HEX", "0123456789abcdefABCDEF") + + return token + + def parse_float_literal(self): """Numeric constants can take many forms: - integer constants only allow digits [0-9] - real number constant only allow digits [0-9], @@ -244,155 +309,57 @@ def constant(self): a numeric constant could start with a '.' (dot character) """ - pos = self.line_pos() - tkn_value = "" - bucket = ".0123456789aAbBcCdDeEfFlLuUxX-+" - while self.peek_char() and ( - self.peek_char() in bucket or self.peek_char() == "\\\n" - ): - if self.peek_char() in "xX": - if tkn_value.startswith("0") is False or len(tkn_value) > 1: - raise TokenError(pos) - for c in "xX": - if c in tkn_value: - raise TokenError(pos) - - elif self.peek_char() in "bB": - if ( - tkn_value != "0" - and tkn_value.startswith("0x") is False - and tkn_value.startswith("0X") is False - ): - raise TokenError(pos) - - elif self.peek_char() in "+-": - if ( - tkn_value.endswith("e") is False - and tkn_value.endswith("E") is False - or self.peek_sub_string(2) in ["++", "--"] - ): - break - - elif ( - self.peek_char() in "eE" - and "0x" not in tkn_value - and "0X" not in tkn_value - ): - if ( - "e" in tkn_value - or "E" in tkn_value - or "f" in tkn_value - or "F" in tkn_value - or "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif self.peek_char() in "lL": - lcount = tkn_value.count("l") + tkn_value.count("L") - if ( - lcount > 1 - or (lcount == 1 and tkn_value[-1] not in "lL") - or ("f" in tkn_value or "F" in tkn_value) - and "0x" not in tkn_value - and "0X" not in tkn_value - ): - raise TokenError(pos) - elif ( - self.peek_char() == "l" - and "L" in tkn_value - or self.peek_char() == "L" - and "l" in tkn_value - ): - raise TokenError(pos) - - elif self.peek_char() in "uU": - if ( - "u" in tkn_value - or "U" in tkn_value - or ( - ( - "e" in tkn_value - or "E" in tkn_value - or "f" in tkn_value - or "F" in tkn_value - ) - and ("0x" not in tkn_value and "0X" not in tkn_value) - ) - ): - raise TokenError(pos) - - elif self.peek_char() in "Ff": - if ( - tkn_value.startswith("0x") is False - and tkn_value.startswith("0X") is False - and ("." not in tkn_value or "f" in tkn_value or "F" in tkn_value) - and "e" not in tkn_value - or "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif ( - self.peek_char() in "aAbBcCdDeE" - and tkn_value.startswith("0x") is False - and tkn_value.startswith("0X") is False - or "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif ( - self.peek_char() in "0123456789" - and "u" in tkn_value - or "U" in tkn_value - or "l" in tkn_value - or "L" in tkn_value - ): - raise TokenError(pos) - - elif self.peek_char() == "." and "." 
in tkn_value: - raise TokenError(pos) - - tkn_value += self.peek_char() - self.pop_char() - if ( - tkn_value[-1] in "eE" - and tkn_value.startswith("0x") is False - or tkn_value[-1] in "xX" - ): - raise TokenError(pos) - else: - self.tokens.append(Token("CONSTANT", pos, tkn_value)) - - def mult_comment(self): - pos = self.line_pos() + constant = self.raw_peek() + if constant is None: + return + pos = lineno, column = self.line_pos() + src = self.file.source[self.__pos:] + if match := FLOAT_EXPONENT_LITERAL_PATTERN.match(src): + suffix = len(match["Suffix"]) + column += len(match["Constant"]) + error = None + if re.match(r"[eE][-+]?\d+", match["Exponent"]) is None: + error = Error.from_name("BAD_EXPONENT") + error.add_highlight(lineno, column, length=len(match["Exponent"]) + suffix) + elif match["Suffix"] not in ('', *"lLfF"): + error = Error.from_name("BAD_FLOAT_SUFFIX") + error.add_highlight(lineno, column + suffix, length=suffix) + if error: + self.file.errors.add(error) + return Token("CONSTANT", pos, self.pop(times=match.end())) + if match := FLOAT_FRACTIONAL_LITERAL_PATTERN.match(src): + # TODO Continue here lol + return Token("CONSTANT", pos, self.pop(times=match.end())) + + def parse_multi_line_comment(self) -> Optional[Token]: + if self.raw_peek(collect=2) != "/*": + return + pos = lineno, column = self.line_pos() val = self.pop(times=2) - # TODO Add to put `UnexpectedEOF` exception as an error in `file.errors` + eof = False while self.peek(): - # the `.pop(...)` can raise an `UnexpectedEOF` if source is like: - # ```c - # /*\ - # - # ``` - # note the backslash followed by an empty line - val += self.pop(use_spaces=True) + try: + val += self.pop(use_spaces=True) + except UnexpectedEOF: + eof = True + break if val.endswith("*/"): break else: - raise UnexpectedEOF() - self.tokens.append(Token("MULT_COMMENT", pos, val)) - - def comment(self): + eof = True + if eof: + # TODO Add a better highlight since it is a multi-line token + error = Error.from_name("UNEXPECTED_EOF_MC") + error.add_highlight(lineno, column, length=len(val)) + self.file.errors.add(error) + return Token("MULT_COMMENT", pos, val) + + def parse_line_comment(self) -> Optional[Token]: """Comments are anything after '//' characters, up until a newline or end of file """ + if self.raw_peek(collect=2) != "//": + return pos = self.line_pos() val = self.pop(times=2) while self.peek(): @@ -403,161 +370,115 @@ def comment(self): val += self.pop() except UnexpectedEOF: break - self.tokens.append(Token("COMMENT", pos, val)) + return Token("COMMENT", pos, val) - def identifier(self): + def parse_identifier(self) -> Optional[Token]: """Identifiers can start with any letter [a-z][A-Z] or an underscore and contain any letters [a-z][A-Z] digits [0-9] or underscores """ + char = self.raw_peek() + if not char or char not in string.ascii_letters + '_': + return pos = self.line_pos() - tkn_value = "" - while self.peek_char() and ( - self.peek_char() in string.ascii_letters + "0123456789_" - or self.peek_char() == "\\\n" - ): - if self.peek_char() == "\\\n": - self.pop_char() - continue - tkn_value += self.peek_char() - self.pop_char() - if tkn_value in keywords: - self.tokens.append(Token(keywords[tkn_value], pos)) - - else: - self.tokens.append(Token("IDENTIFIER", pos, tkn_value)) + val = self.pop() + while char := self.raw_peek(): + if char not in string.ascii_letters + "0123456789_": + break + val += self.pop() + if val in keywords: + return Token(keywords[val], pos) + return Token("IDENTIFIER", pos, val) - def operator(self): + def 
parse_operator(self): """Operators can be made of one or more sign, so the longest operators need to be looked up for first in order to avoid false positives eg: '>>' being understood as two 'MORE_THAN' operators instead of one 'RIGHT_SHIFT' operator """ + char = self.raw_peek() + if not char or char not in "+-*/,<>^&|!=%;:.~?#": + return pos = self.line_pos() - if self.peek_char() in ".+-*/%<>^&|!=": - if self.peek_sub_string(3) in [">>=", "<<=", "..."]: - self.tokens.append(Token(operators[self.peek_sub_string(3)], pos)) - self.pop_char(), self.pop_char(), self.pop_char() - - elif self.peek_sub_string(2) in [">>", "<<", "->"]: - self.tokens.append(Token(operators[self.peek_sub_string(2)], pos)) - self.pop_char(), self.pop_char() - - elif self.peek_sub_string(2) == self.peek_char() + "=": - self.tokens.append(Token(operators[self.peek_sub_string(2)], pos)) - self.pop_char(), self.pop_char() - - elif self.peek_char() in "+-<>=&|": - if self.peek_sub_string(2) == self.peek_char() * 2: - self.tokens.append(Token(operators[self.peek_sub_string(2)], pos)) - self.pop_char() - self.pop_char() - - else: - self.tokens.append(Token(operators[self.peek_char()], pos)) - self.pop_char() - - else: - self.tokens.append(Token(operators[self.peek_char()], pos)) - self.pop_char() - - else: - self.tokens.append(Token(operators[self.src[self.__pos]], pos)) - self.pop_char() + if char in ".+-*/%<>^&|!=": + if self.raw_peek(collect=3) in (">>=", "<<=", "..."): + return Token(operators[self.pop(times=3)], pos) + if self.raw_peek(collect=2) in (">>", "<<", "->"): + return Token(operators[self.pop(times=2)], pos) + if self.raw_peek(collect=2) == char + "=": + return Token(operators[self.pop(times=2)], pos) + if char in "+-<>=&|": + if self.raw_peek(collect=2) == char * 2: + return Token(operators[self.pop(times=2)], pos) + char = self.pop() + return Token(operators[char], pos) + + def parse_whitespace(self) -> Optional[Token]: + char = self.raw_peek() + if char is None or char not in "\n\t ": + return + if char == ' ': + token = Token("SPACE", self.line_pos()) + elif char == "\t": + token = Token("TAB", self.line_pos()) + elif char == "\n": + token = Token("NEWLINE", self.line_pos()) + self.pop() + return token + + def parse_brackets(self) -> Optional[Token]: + result = self.peek() + if result is None: + return + char, _ = result + if char not in brackets: + return + start = self.line_pos() + value = self.pop() + return Token(brackets[value], start) + + parsers = ( + parse_float_literal, # Need to be above: + # `parse_operator` to avoid `` + # `parse_integer_literal` to avoid `\d+` + parse_integer_literal, + parse_char_literal, + parse_string_literal, + parse_identifier, # Need to be bellow `char` and `string` + parse_whitespace, + parse_line_comment, + parse_multi_line_comment, + parse_operator, + parse_brackets, + ) def get_next_token(self): """Peeks one character and tries to match it to a token type, if it doesn't match any of the token types, an error will be raised and current file's parsing will stop """ - while self.peek_char() is not None: - if self.is_string(): - self.string() - - elif ( - self.peek_char().isalpha() and self.peek_char().isascii() - ) or self.peek_char() == "_": - self.identifier() - - elif self.is_constant(): - self.constant() - - elif self.is_char_constant(): - self.char_constant() - - elif self.peek_char() == "#": - self.tokens.append(Token("HASH", self.line_pos())) - self.pop_char() - - elif self.src[self.__pos :].startswith("/*"): - self.mult_comment() - - elif self.src[self.__pos 
:].startswith("//"): - self.comment() - - elif self.peek_char() in "+-*/,<>^&|!=%;:.~?": - self.operator() - - elif self.peek_char() == " ": - self.tokens.append(Token("SPACE", self.line_pos())) - self.pop_char() - - elif self.peek_char() == "\t": - self.tokens.append(Token("TAB", self.line_pos())) - self.pop_char() - - elif self.peek_char() == "\n": # or ord(self.peek_char()) == 8203: - self.tokens.append(Token("NEWLINE", self.line_pos())) - self.pop_char() - self.__line_pos = 1 + while self.raw_peek(): + if self.raw_peek(collect=2) == "\\\n" or self.raw_peek(collect=4) == "??/\n": + # Avoid using `.pop()` here since it ignores the escaped + # newline and pops and upcomes after it. E.g, if we have + # `\\\nab` and use `.pop()`, the parsers funcs will see `b``. + _, size = self.peek() # type: ignore + self.__pos += size + 1 self.__line += 1 - - elif self.peek_char() == "\\\n": - self.tokens.append(Token("ESCAPED_NEWLINE", self.line_pos())) - self.pop_char() self.__line_pos = 1 - self.__line += 1 - - elif self.peek_char() in brackets: - self.tokens.append(Token(brackets[self.peek_char()], self.line_pos())) - self.pop_char() else: - raise TokenError(self.line_pos()) - - return self.peek_token() - - return None - - def get_tokens(self): - """Iterate through self.get_next_token() to convert source code into a - token list - """ - while self.get_next_token(): - continue - return self.tokens - - def print_tokens(self): - if self.tokens == []: - return - for t in self.tokens: - if t.type == "NEWLINE": - print(t) - else: - print(t, end="") - if self.tokens[-1].type != "NEWLINE": - print("") + break + for parser in self.parsers: + if result := parser(self): + return result + if char := self.raw_peek(): + error = Error("BAD_LEXEME", f"No matchable token for '{char}' lexeme") + error.add_highlight(*self.line_pos(), length=1) + self.file.errors.add(error) + self.__pos += 1 + self.__line_pos += 1 + # BUG If we have multiples bad lexemes, it can raise RecursionError + return self.get_next_token() - def check_tokens(self): - """ - Only used for testing - """ - if self.tokens == []: - self.get_tokens() - if self.tokens == []: - return "" - ret = "" - for i in range(0, len(self.tokens)): - ret += self.tokens[i].test() - ret += "" if self.tokens[i].type != "NEWLINE" else "\n" - if self.tokens[-1].type != "NEWLINE": - ret += "\n" - return ret + def __iter__(self): + while token := self.get_next_token(): + yield token diff --git a/norminette/norm_error.py b/norminette/norm_error.py index bbbbaaa..4e8390f 100644 --- a/norminette/norm_error.py +++ b/norminette/norm_error.py @@ -122,6 +122,21 @@ "FORBIDDEN_STRUCT": "Struct declaration are not allowed in .c files", "FORBIDDEN_UNION": "Union declaration are not allowed in .c files", "FORBIDDEN_ENUM": "Enum declaration are not allowed in .c files", + "UNEXPECTED_EOF_CHR": "Unexpected end of file (EOF) while parsing a char", + "UNEXPECTED_EOL_CHR": "Unexpected end of line (EOL) while parsing a char", + "UNEXPECTED_EOF_MC": "Unexpected end of file (EOF) while parsing a multiline comment", + "UNEXPECTED_EOF_STR": "Unexpected end of file (EOF) while parsing a string", + "EMPTY_CHAR": "Empty character constant", + "CHAR_AS_STRING": "Character constants can have only one character", + "INVALID_SUFFIX": "This suffix is invalid", + "BAD_FLOAT_SUFFIX": "Invalid suffix for float/double literal constant", + "INVALID_BIN_INT": "Invalid binary integer literal", + "INVALID_OCT_INT": "Invalid octal integer literal", + "INVALID_HEX_INT": "Invalid hexadecimal integer literal", + 
"MAXIMAL_MUNCH": "Potential maximal munch detected", + "NO_HEX_DIGITS": "No hexadecimal digits followed by the \\x", + "UNKNOWN_ESCAPE": "Unknown escape sequence", + "BAD_EXPONENT": "Exponent has no digits", } diff --git a/norminette/registry.py b/norminette/registry.py index 3d9b5e1..9e10068 100644 --- a/norminette/registry.py +++ b/norminette/registry.py @@ -58,7 +58,7 @@ def run(self, context): raise CParsingError( f"Error: Unrecognized line {unrecognized_tkns[0].pos} while parsing line {unrecognized_tkns}" # noqa: E501 ) - print("uncaught -> ", context.filename) + print("uncaught -> ", context.file.name) print("uncaught -> ", unrecognized_tkns) unrecognized_tkns = [] context.dprint(rule.name, jump) diff --git a/tests/lexer/brackets_tokens_test.py b/tests/lexer/brackets_tokens_test.py deleted file mode 100644 index 999b7a3..0000000 --- a/tests/lexer/brackets_tokens_test.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer - -brackets = ( - ('{', "LBRACE"), - ('}', "RBRACE"), - ("(", "LPARENTHESIS"), - (")", "RPARENTHESIS"), - ("[", "LBRACKET"), - ("]", "RBRACKET"), -) - - -@pytest.mark.parametrize("lexeme,name", brackets) -def test_brackets_tokens(lexeme, name): - token = Lexer(File("", lexeme)).get_next_token() - assert token.type == name diff --git a/tests/lexer/char_constant_tokens_test.py b/tests/lexer/char_constant_tokens_test.py deleted file mode 100644 index b21adb7..0000000 --- a/tests/lexer/char_constant_tokens_test.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer, TokenError - -char_constants = ( - ("'*'", ""), - ("'\\n'", ""), - ("'\\042'", ""), - ("'0x042'", ""), - ("'\n1'", None), - ("'\\n\n'", None), - ("'A", None), -) - - -@pytest.mark.parametrize("lexeme,expected", char_constants) -def test_char_constants_tokens(lexeme, expected): - lexer = Lexer(File("", lexeme)) - if expected is None: - with pytest.raises(TokenError): - lexer.get_next_token() - return - token = lexer.get_next_token() - assert token.test() == expected diff --git a/tests/lexer/constant_tokens_test.py b/tests/lexer/constant_tokens_test.py deleted file mode 100644 index 07adbde..0000000 --- a/tests/lexer/constant_tokens_test.py +++ /dev/null @@ -1,48 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer, TokenError - -constants = ( - ("42", "\n"), - ("+42", "\n"), - ("-42", "\n"), - ("+-42", "\n"), - ("4.2", "\n"), - (".42", "\n"), - ("4e2", "\n"), - (".4e2", "\n"), - ("4e2f", "\n"), - (".4e2f", "\n"), - ("042", "\n"), - ("0x42", "\n"), - ("-0x4e2", "\n"), - ("42l", "\n"), - ("42ul", "\n"), - ("42ll", "\n"), - ("42ull", "\n"), - ("42u", "\n"), - ( - "-+-+-+-+-+-+-+-0Xe4Ae2", - "" - "\n" - ), - (".e42", "\n"), - ("4.4.4", None), - ("4e4e4", None), - ("4x4x4", None), - ("42uul", None), - ("42Lllu", None), - ("42lul", None), - (".42e", None), -) - - -@pytest.mark.parametrize("lexeme,expected", constants) -def test_constants_tokens(lexeme, expected): - lexer = Lexer(File("", lexeme)) - if expected is None: - with pytest.raises(TokenError): - lexer.get_next_token() - return - assert lexer.check_tokens() == expected diff --git a/tests/lexer/identifiers_tokens_test.py b/tests/lexer/identifiers_tokens_test.py deleted file mode 100644 index ee4e1b3..0000000 --- a/tests/lexer/identifiers_tokens_test.py +++ /dev/null @@ -1,46 +0,0 @@ -import unittest - -from norminette.file import File -from norminette.lexer.lexer import Lexer - - -def 
eat_tokens(line): - lex = Lexer(File("", line)) - line = "" - while lex.get_next_token(): - line += lex.peek_token().test() - if lex.peek_token().type in ["EOF", "ERROR"]: - break - return line - - -class IdentifiersTokensTest(unittest.TestCase): - def test_simple_identifier(self): - self.assertEqual(eat_tokens("foo"), "") - - def test_underscore_identifier(self): - self.assertEqual(eat_tokens("_foo"), "") - - def test_underscore_with_number_identifier(self): - self.assertEqual(eat_tokens("_foo42"), "") - - def test_double_underscore_with_number_identifier(self): - self.assertEqual(eat_tokens("_foo__42"), "") - - def test_underscore_and_uppercase_identifier(self): - self.assertEqual(eat_tokens("_FOO"), "") - - def test_underscore_at_the_end_and_uppercase_identifier(self): - self.assertEqual(eat_tokens("FOO_"), "") - - def test_identifier_can_not_start_with_a_number(self): - self.assertNotEqual(eat_tokens("5_FOO_"), "") - - def test_identifier_can_not_have_a_space(self): - self.assertNotEqual(eat_tokens("foo 1"), "", - ) diff --git a/tests/lexer/keywords_tokens_test.py b/tests/lexer/keywords_tokens_test.py deleted file mode 100644 index 9d21ad6..0000000 --- a/tests/lexer/keywords_tokens_test.py +++ /dev/null @@ -1,162 +0,0 @@ -import unittest - -from norminette.file import File -from norminette.lexer.lexer import Lexer - - -def eat_tokens(line): - lex = Lexer(File("", line)) - tokens = [] - while lex.get_next_token(): - tokens.append(lex.peek_token().test()) - if len(tokens) == 1: - return tokens[0] - return tokens - - -class TokensKeywordsTest(unittest.TestCase): - def test_auto_keyword(self): - self.assertEqual(eat_tokens("auto"), "") - - def test_break_keyword(self): - self.assertEqual(eat_tokens("break"), "") - - def test_case_keyword(self): - self.assertEqual(eat_tokens("case"), "") - - def test_char_keyword(self): - self.assertEqual(eat_tokens("char"), "") - - def test_const_keyword(self): - self.assertEqual(eat_tokens("const"), "") - - def test_continue_keyword(self): - self.assertEqual(eat_tokens("continue"), "") - - def test_default_keyword(self): - self.assertEqual(eat_tokens("default"), "") - - def test_do_keyword(self): - self.assertEqual(eat_tokens("do"), "") - - def test_double_keyword(self): - self.assertEqual(eat_tokens("double"), "") - - def test_else_keyword(self): - self.assertEqual(eat_tokens("else"), "") - - def test_enum_keyword(self): - self.assertEqual(eat_tokens("enum"), "") - - def test_extern_keyword(self): - self.assertEqual(eat_tokens("extern"), "") - - def test_float_keyword(self): - self.assertEqual(eat_tokens("float"), "") - - def test_for_keyword(self): - self.assertEqual(eat_tokens("for"), "") - - def test_goto_keyword(self): - self.assertEqual(eat_tokens("goto"), "") - - def test_if_keyword(self): - self.assertEqual(eat_tokens("if"), "") - - def test_int_keyword(self): - self.assertEqual(eat_tokens("int"), "") - - def test_long_keyword(self): - self.assertEqual(eat_tokens("long"), "") - - def test_register_keyword(self): - self.assertEqual(eat_tokens("register"), "") - - def test_return_keyword(self): - self.assertEqual(eat_tokens("return"), "") - - def test_signed_keyword(self): - self.assertEqual(eat_tokens("signed"), "") - - def test_sizeof_keyword(self): - self.assertEqual(eat_tokens("sizeof"), "") - - def test_static_keyword(self): - self.assertEqual(eat_tokens("static"), "") - - def test_struct_keyword(self): - self.assertEqual(eat_tokens("struct"), "") - - def test_switch_keyword(self): - self.assertEqual(eat_tokens("switch"), "") - - def 
test_typedef_keyword(self): - self.assertEqual(eat_tokens("typedef"), "") - - def test_union_keyword(self): - self.assertEqual(eat_tokens("union"), "") - - def test_unsigned_keyword(self): - self.assertEqual(eat_tokens("unsigned"), "") - - def test_void_keyword(self): - self.assertEqual(eat_tokens("void"), "") - - def test_volatile_keyword(self): - self.assertEqual(eat_tokens("volatile"), "") - - def test_while_keyword(self): - self.assertEqual(eat_tokens("while"), "") - - def test_define_keyword(self): - self.assertEqual(eat_tokens("#define"), ["", ""]) - self.assertEqual(eat_tokens("# define "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#define "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#define //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#define//bla "), ["", "", ""]) - - def test_error_keyword(self): - self.assertEqual(eat_tokens("#error"), ["", ""]) - self.assertEqual(eat_tokens("# error "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#error "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#error //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#error//bla "), ["", "", ""]) - - def test_ifndef_keyword(self): - self.assertEqual(eat_tokens("#ifndef"), ["", ""]) - self.assertEqual(eat_tokens("# ifndef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifndef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifndef //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifndef//bla "), ["", "", ""]) - - def test_ifdef_keyword(self): - self.assertEqual(eat_tokens("#ifdef"), ["", ""]) - self.assertEqual(eat_tokens("# ifdef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifdef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifdef //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#ifdef//bla "), ["", "", ""]) - - def test_include_keyword(self): - self.assertEqual(eat_tokens("#include"), ["", ""]) - self.assertEqual(eat_tokens("# include "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#include "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#include //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#include//bla "), ["", "", ""]) - - def test_pragma_keyword(self): - self.assertEqual(eat_tokens("#pragma"), ["", ""]) - self.assertEqual(eat_tokens("# pragma "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#pragma "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#pragma //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#pragma//bla "), ["", "", ""]) - - def test_undef_keyword(self): - self.assertEqual(eat_tokens("#undef"), ["", ""]) - self.assertEqual(eat_tokens("# undef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#undef "), ["", "", "", ""]) - self.assertEqual(eat_tokens("#undef //bla"), ["", "", "", ""]) - self.assertEqual(eat_tokens("#undef//bla "), ["", "", ""]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/lexer/operators_tokens_test.py b/tests/lexer/operators_tokens_test.py deleted file mode 100644 index 781978c..0000000 --- a/tests/lexer/operators_tokens_test.py +++ /dev/null @@ -1,53 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer - -operators = ( - (">>=", "RIGHT_ASSIGN"), - ("<<=", "LEFT_ASSIGN"), - ("+=", "ADD_ASSIGN"), - ("-=", "SUB_ASSIGN"), - ("*=", "MUL_ASSIGN"), - ("/=", "DIV_ASSIGN"), - ("%=", "MOD_ASSIGN"), - ("&=", "AND_ASSIGN"), - ("^=", "XOR_ASSIGN"), - ("|=", "OR_ASSIGN"), - ("<=", "LESS_OR_EQUAL"), - (">=", "GREATER_OR_EQUAL"), - ("==", "EQUALS"), - ("!=", "NOT_EQUAL"), - ("=", 
"ASSIGN"), - (";", "SEMI_COLON"), - (":", "COLON"), - (",", "COMMA"), - (".", "DOT"), - ("!", "NOT"), - ("-", "MINUS"), - ("+", "PLUS"), - ("*", "MULT"), - ("/", "DIV"), - ("%", "MODULO"), - ("<", "LESS_THAN"), - (">", "MORE_THAN"), - ("...", "ELLIPSIS"), - ("++", "INC"), - ("--", "DEC"), - ("->", "PTR"), - ("&&", "AND"), - ("||", "OR"), - ("^", "BWISE_XOR"), - ("|", "BWISE_OR"), - ("~", "BWISE_NOT"), - ("&", "BWISE_AND"), - (">>", "RIGHT_SHIFT"), - ("<<", "LEFT_SHIFT"), - ("?", "TERN_CONDITION"), -) - - -@pytest.mark.parametrize("operator,type", operators) -def test_operators_tokens(operator, type): - token = Lexer(File("", operator)).get_next_token() - assert token.type == type diff --git a/tests/lexer/string_tokens_test.py b/tests/lexer/string_tokens_test.py deleted file mode 100644 index bccbd60..0000000 --- a/tests/lexer/string_tokens_test.py +++ /dev/null @@ -1,17 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer - -strings = ( - ('"Basic string"', ''), - ('L"Basic string"', ''), - ('"Basic \\"string\\""', ''), - ('"Escaped \\\\\\"string\\\\\\\\\\"\\\\"', ''), -) - - -@pytest.mark.parametrize("string,expected", strings) -def test_string_tokens(string, expected): - token = Lexer(File("", string)).get_next_token() - assert token.test() == expected diff --git a/tests/rules/rules_generator_test.py b/tests/rules/rules_generator_test.py index f2049ad..979b9e6 100644 --- a/tests/rules/rules_generator_test.py +++ b/tests/rules/rules_generator_test.py @@ -22,7 +22,7 @@ def test_rule_for_file(file, capsys): file = File(file, file_to_lex) lexer = Lexer(file) - context = Context(file, lexer.get_tokens(), debug=2) + context = Context(file, list(lexer), debug=2) registry.run(context) errors = HumanizedErrorsFormatter(file) print(errors) diff --git a/tests/rules/samples/check_preprocessor_define.out b/tests/rules/samples/check_preprocessor_define.out index bbf7d2e..a3415a4 100644 --- a/tests/rules/samples/check_preprocessor_define.out +++ b/tests/rules/samples/check_preprocessor_define.out @@ -17,7 +17,7 @@ check_preprocessor_define.c - IsEmptyLine In "GlobalScope" from "None" line 9": check_preprocessor_define.c - IsPreprocessorStatement In "GlobalScope" from "None" line 10": - + check_preprocessor_define.c - IsEmptyLine In "GlobalScope" from "None" line 12": check_preprocessor_define.c - IsPreprocessorStatement In "GlobalScope" from "None" line 13": diff --git a/tests/rules/samples/integer_constants.out b/tests/rules/samples/integer_constants.out index 718c161..24b4b9a 100644 --- a/tests/rules/samples/integer_constants.out +++ b/tests/rules/samples/integer_constants.out @@ -15,9 +15,9 @@ integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 8": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 9": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 10": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 11": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 12": @@ -31,9 +31,9 @@ integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 16": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 17": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 18": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 19": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 20": @@ -47,9 +47,9 @@ 
integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 24": integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 25": - + integer_constants.c - IsVarDeclaration In "Function" from "GlobalScope" line 26": - + integer_constants.c - IsBlockEnd In "Function" from "GlobalScope" line 27": integer_constants.c - IsEmptyLine In "GlobalScope" from "None" line 28": diff --git a/tests/rules/samples/ok_func_classic.out b/tests/rules/samples/ok_func_classic.out index 9c27551..0e4b1cd 100644 --- a/tests/rules/samples/ok_func_classic.out +++ b/tests/rules/samples/ok_func_classic.out @@ -84,3 +84,4 @@ ok_func_classic.c: Error! Notice: GLOBAL_VAR_DETECTED (line: 1, col: 1): Global variable present in file. Make sure it is a reasonable choice. Error: INVALID_HEADER (line: 1, col: 1): Missing or invalid 42 header +Error: CHAR_AS_STRING (line: 48, col: 21): Character constants can have only one character diff --git a/tests/rules/samples/test_file_210223.out b/tests/rules/samples/test_file_210223.out index e1784e9..a93c258 100644 --- a/tests/rules/samples/test_file_210223.out +++ b/tests/rules/samples/test_file_210223.out @@ -3,12 +3,11 @@ test_file_210223.c - IsBlockStart In "Function" from "GlobalScope" line 2": test_file_210223.c - IsControlStatement In "Function" from "GlobalScope" line 3": - + test_file_210223.c - IsExpressionStatement In "ControlStructure" from "Function" line 5": test_file_210223.c - IsFunctionCall In "Function" from "GlobalScope" line 6": - + test_file_210223.c - IsBlockEnd In "Function" from "GlobalScope" line 8": test_file_210223.c - IsEmptyLine In "GlobalScope" from "None" line 9": @@ -25,3 +24,5 @@ test_file_210223.c: Error! Error: INVALID_HEADER (line: 1, col: 1): Missing or invalid 42 header +Error: MIXED_SPACE_TAB (line: 3, col: 61): Mixed spaces and tabs +Error: TAB_INSTEAD_SPC (line: 4, col: 1): Found tab when expecting space diff --git a/tests/test_errors.py b/tests/test_errors.py index b6333be..a65b999 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -43,7 +43,7 @@ @pytest.mark.parametrize("file,test", [it.values() for it in tests]) def test_json_formatter_errored_file(file, test): lexer = Lexer(file) - context = Context(file, lexer.get_tokens()) + context = Context(file, list(lexer)) Registry().run(context) formatter = JSONErrorsFormatter(file) diff --git a/tests/test_lexer.py b/tests/test_lexer.py new file mode 100644 index 0000000..c60928a --- /dev/null +++ b/tests/test_lexer.py @@ -0,0 +1,483 @@ +from typing import Dict, Any, List, Optional + +import pytest + +from norminette.file import File +from norminette.lexer import Lexer, Token as T +from norminette.lexer.dictionary import keywords, operators, brackets +from norminette.errors import Error as E, Highlight as H +from norminette.exceptions import UnexpectedEOF + + +def lexer_from_source(source: str, /) -> Lexer: + file = File("", source) + return Lexer(file) + + +def dict_to_pytest_param(data: Dict[str, List[Any]]): + params = [] + for id, values in data.items(): + param = pytest.param(*values, id=id) + params.append(param) + return params + + +@pytest.mark.parametrize("source, parameters, expected", dict_to_pytest_param({ + "No args": ["oi", {}, 'o'], + "Empty source": ['', {}, None], + "Collect over than source length": ["hello", {"collect": 10}, "hello"], + "Collect with empty source": ['', {"collect": 3}, None], + "Offset in empty source": ['', {"offset": 3}, None], + "Offset": ["Hello", {"offset": 2}, 'l'], + "Offset with collect": ["Hello, world!", 
{"offset": 7, "collect": 5}, "world"], + "Offset over than source length with collect": ["Hello, world!", {"offset": 14, "collect": 3}, None], + "Newline": ["\naa", {}, '\n'], + "Escaped newline": ["\\\n", {}, '\\'], +})) +def test_lexer_raw_peek(source: str, parameters: Dict[str, Any], expected: Optional[str]): + lexer = lexer_from_source(source) + + assert lexer.raw_peek(**parameters) == expected + + +@pytest.mark.parametrize("source, parameters, expected", dict_to_pytest_param({ + "Empty source": ['', {}, None], + "One letter source": ['a', {}, 'a'], + "Two letter source": ["ab", {}, 'a'], + "Times with no source length": ['a', {"times": 2}, None], + "Times with large source length": ["abc", {"times": 2}, "ab"], + "Times with exact source length": ["abc", {"times": 3}, "abc"], + "Tab": ["\t 2", {}, '\t'], + "Tab with use_spaces": ["\t 2", {"use_spaces": True}, " "], + "Tab with use_spaces and times": ["\t 2", {"use_spaces": True, "times": 3}, " 2"], + "Tab in second column with use_spaces and times": ["ch\t2", {"use_spaces": True, "times": 4}, "ch 2"], + "Newline followed by a letter": ["\na", {}, '\n'], + "Newline with times": ["\nab", {"times": 2}, "\na"], + "Escaped newline in EOF": ["\\\n", {}, None], + "Escaped newline in SOF": ["\\\nabc", {}, 'a'], + "Escaped newline with times": ["a\\\nbc", {"times": 2}, "ab"], + "Backslash followed by a escaped newline with times": ["\\\\\nab", {"times": 2}, r"\a"], + "Escaped single quote without use_escape": [r"\'", {}, '\\'], + "Escaped single quote with use_escape": [r"\'", {"use_escape": True}, r"\'"], + "Escaped newline without use_escape": [r"\n", {}, '\\'], + "Escaped newline with use_escape": [r"\'", {"use_escape": True}, r"\'"], + "Twice escaped single quote with times but without use_escape": [r"\'\'", {"times": 4}, r"\'\'"], + "String with escaped single quote twice with times but without use_escape": ["\"\\'\\'\"", {"times": 6}, + "\"\\'\\'\""], + "String with escaped single quote twice with times and use_escape": ["\"\\'\\'\"", + {"times": 4, "use_escape": True}, + "\"\\'\\'\""], + "Char containing newline with times and use_escape": [r"'\n'", {"times": 3, "use_escape": True}, r"'\n'"], + "Use trigraph instead of backslash to escape single quote": ["??/'", {"use_escape": True}, r"\'"], + "Use trigraph to escape newline with times": ["a??/\nb", {"times": 2, "use_escape": True}, r"ab"], + "Multiples escaped newlines": ["\\\n\\\na\n", {}, 'a'], + "Multiples escaped newlines with trigraphs": ["??/\n??/\na\n", {}, 'a'], + "Bla": ["\\\na\n", {}, 'a'], +})) +def test_lexer_pop(source: str, parameters: Dict[str, Any], expected: Optional[str]): + lexer = lexer_from_source(source) + + if expected is None: + pytest.raises(UnexpectedEOF, lexer.pop, **parameters) + else: + assert lexer.pop(**parameters) == expected + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Unexpected EOF only quote": ['\'', "", [ + E.from_name("UNEXPECTED_EOF_CHR", highlights=[H(lineno=1, column=1, length=1)]) + ]], + "Unexpected EOF with quote and letter": ["'a", "", [ + E.from_name("UNEXPECTED_EOF_CHR", highlights=[H(lineno=1, column=1, length=2)]), + ]], + "Unexpected EOF with char as string": ["'Farofa ", "", [ + E.from_name("UNEXPECTED_EOF_CHR", highlights=[H(lineno=1, column=1, length=len("'Farofa "))]), + ]], + "Unexpected EOL only quote": ["'\n", "", [ + E.from_name("UNEXPECTED_EOL_CHR", highlights=[ + H(lineno=1, column=1, length=1), + H(lineno=1, column=1 + len("'"), length=1, hint="Perhaps you forgot a single quote 
(')?"), + ]), + ]], + "Unexpected EOL with quote and letter": ["'a\n", "", [ + E.from_name("UNEXPECTED_EOL_CHR", highlights=[ + H(lineno=1, column=1, length=2), + H(lineno=1, column=1 + len("'a"), length=1, hint="Perhaps you forgot a single quote (')?"), + ]), + ]], + "Unexpected EOL with char as string": ["'Astronauta\n", "", [ + E.from_name("UNEXPECTED_EOL_CHR", highlights=[ + H(lineno=1, column=1, length=len("'Astronauta")), + H(lineno=1, column=1 + len("'Astronauta"), length=1, hint="Perhaps you forgot a single quote (')?"), + ]), + ]], + "ASCII letter": ["'a'", "", []], + "ASCII number": ["'9'", "", []], + "Single quote escaped": [r"'\''", r"", []], + "Newline": [r"'\n'", r"", []], + "Empty char": ["''", "", [ + E.from_name("EMPTY_CHAR", highlights=[H(lineno=1, column=1, length=2)])], + ], + "String quote": ['"a"', "None", []], + "Int literal": ['1', "None", []], + "Null": [r"'\0'", r"", []], + "Hexadecimal char E9 (é)": [r"'\xE9'", R"", []], + "Hexadecimal char without sequence": [r"'\x'", R"", [ + E.from_name("NO_HEX_DIGITS", level="Notice", highlights=[ + H(lineno=1, column=3, length=1), + ]), + ]], + "Escape sequence that doesn't exists": [r"'\j'", r"", [ + E.from_name("UNKNOWN_ESCAPE", level="Notice", highlights=[ + H(lineno=1, column=3, length=1), + ]), + ]], + "Char too long": ["'John Galt'", "", [ + E.from_name("CHAR_AS_STRING", highlights=[ + H(lineno=1, column=1, length=len("'John Galt'")), + H(lineno=1, column=1, length=1, + hint="Perhaps you want a string (double quote, \") instead of a char (single quote, ')?"), + ]) + ]], + "Char with L prefix": ["L'a'", "", []], + "Char escaped with L prefix": [r"L'\n'", r"", []], + "Hex with one digit": [r"'\xA'", r"", []], + "Hex with two digits": [r"'\x3F'", r"", []], +})) +def test_lexer_parse_char_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_char_literal() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Empty string": ["\"\"", "", []], + "ASCII normal string": ["\"x+1=2, where x=1\"", "", []], + "Single quote string": ["'teste'", "None", []], + "Unexpected EOF with empty string": ['\"', "", [ + E.from_name("UNEXPECTED_EOF_STR", highlights=[ + H(lineno=1, column=1, length=1), + H(lineno=1, column=2, length=1, hint="Perhaps you forgot a double quote (\")?"), + ]), + ]], + "Unexpected EOF": ['\"asd', "", [ + E.from_name("UNEXPECTED_EOF_STR", highlights=[ + H(lineno=1, column=1, length=4), + H(lineno=1, column=5, length=1, hint="Perhaps you forgot a double quote (\")?"), + ]), + ]], + "String with escaped new line": ["\"first\\\n second\"", "", []], + "Basic string": ["\"Basic string\"", "", []], + "L basic string": ["L\"Basic string\"", "", []], + "String with escaped quotes": ["\"Basic \\\"string\\\"\"", "", []], + "Multiples escapes and escaped quote": [r'"Escaped \\\"string\\\\\"\\"', + r'', + []], +})) +def test_lexer_parse_string_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_string_literal() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected", dict_to_pytest_param({ + "Empty comment": ["//", ""], + "Comment at EOF": ["// The sky is falling", ""], + "Comment at EOL": ["// The sky is falling\n", ""], + "Comment with escaped line in EOF": ["// The sky is falling\\", r""], + "Comment 
with escaped line in EOF using trigraph": [r"// The sky is falling??/", + r""], + "Comment with escaped line": ["// The sky is falling\\\n!", ""], + "Comment with escaped line using trigraph": ["// The sky is falling??/\n!", ""], +})) +def test_lexer_parse_line_comment(source: str, str_expected: str): + lexer = lexer_from_source(source) + token = lexer.parse_line_comment() + + assert str(token) == str_expected + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Multi-line comment in single line at EOF": ["/* The sky is falling*/", + "", []], + "Multi-line comment in multiples lines at EOF": ["/*\na\nb\n\n\n*/", "", []], + "Multi-line comment with escaped line": ["/*\\\na*/", "", []], + "Multi-line comment with escaped line using trigraph": ["/*??/\na*/", "", []], + "Multi-line comment not terminated with escaped line before EOF": ["/*\\\n", "", [ + E.from_name("UNEXPECTED_EOF_MC", highlights=[ + H(lineno=1, column=1, length=len("/*")), + ]), + ]], + "Multi-line comment not terminated": ["/* uepaaa\ne agora??", "", [ + E.from_name("UNEXPECTED_EOF_MC", highlights=[ + H(lineno=1, column=1, length=len("/* uepaaa\ne agora??")), + ]), + ]], + "Multi-line comment not terminate ending with a backslash": ["/*\\", r"", [ + E.from_name("UNEXPECTED_EOF_MC", highlights=[ + H(lineno=1, column=1, length=len("/*\\")), + ]), + ]], + "Comment (not multi-line)": ["// hey, i'm not a multi-line comment", "None", []], + "Space before a multi-line comment": [" /* */", "None", []], +})) +def test_lexer_parse_multi_line_comment(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_multi_line_comment() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Decimal integer": ["1234567890", "", []], + "Decimal integer with UL as suffix": ["1234567890UL", "", []], + "Decimal integer with bad suffix": ["1234567890ABC", "", [ + E.from_name("INVALID_SUFFIX", highlights=[ + H(lineno=1, column=11, length=len("ABC")), + ]), + ]], + "Binary integer": ["0b1101011", "", []], + "Binary integer with U as suffix": ["0b000001U", "", []], + "Binary integer with bad digits": ["0b1210491011", "", [ + E.from_name("INVALID_BIN_INT", highlights=[ + H(lineno=1, column=4, length=1, hint=None), # 2 + H(lineno=1, column=7, length=1, hint=None), # 4 + H(lineno=1, column=8, length=1, hint=None), # 9 + ]), + ]], + "Octal integer": ["01234567123", "", []], + "Octal integer with U as suffix": ["0123u", "", []], + "Octal integer with bad digits": ["00072189", "", [ + E.from_name("INVALID_OCT_INT", highlights=[ + H(lineno=1, column=7, length=1, hint=None), # 8 + H(lineno=1, column=8, length=1, hint=None), # 9 + ]), + ]], + "Octal integer with bad suffix with dots": ["000123u.23", "", [ + E.from_name("INVALID_SUFFIX", highlights=[ + H(lineno=1, column=len("000123") + 1, length=len("u.23")), + ]), + ]], + "Integer with u suffix": ["123u", "", []], + "Integer with U suffix": ["123U", "", []], + "Integer with uz suffix": ["123uz", "", []], + "Integer with UZ suffix": ["123UZ", "", []], + "Integer with z suffix": ["123z", "", []], + "Integer with Z suffix": ["123Z", "", []], + "Integer with ul suffix": ["123ul", "", []], + "Integer with UL suffix": ["123UL", "", []], + "Integer with ull suffix": ["123ull", "", []], + "Integer with ULL suffix": ["123ULL", "", []], + "Integer with ll suffix": 
["9000000000ll", "", []], + "Integer with LL suffix": ["9000000000LL", "", []], + "Integer with bad suffix": ["10Uu", "", [ + E.from_name("INVALID_SUFFIX", highlights=[ + H(lineno=1, column=1, length=len("10")), + ]), + ]], +})) +def test_lexer_parse_integer_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_integer_literal() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected, errors", dict_to_pytest_param({ + "Integer": ["1234567890", "None", []], + "Integer with exponent-part": ["1e2", "", []], + "Integer with exponent-part and f as suffix": ["1e2f", "", []], + "Integer with bad exponent-part": ["1eeee2xf", "", [ + E.from_name("BAD_EXPONENT", highlights=[H(lineno=1, column=2, length=7)]), + ]], + "Exponent with sign": ["1e+3", "", []], + "": ["45e++ai", "None", []], + "": ["e42", "None", []], + "": ["0x1uLl;", "None", []], + "": [".0e4x;", "None", []], + "": ["10ul;", "None", []], + "": ["10lul;", "None", []], + "": ["0x1uLl;", "None", []], + "": ["0x1ULl;", "None", []], + "": ["0x1lL;", "None", []], + "": ["0x1Ll;", "None", []], + "": ["0x1UlL;", "None", []], + "Integer with bad suffix": ["10uu", "None", []], + "": ["10Uu", "None", []], + "": ["10UU", "None", []], + "": ["0b0101e", "None", []], + "": ["0b0101f", "None", []], + "": ["0b0X101f", "None", []], + "": ["0X101Uf", "None", []], + "": ["0101f", "None", []], + "": ["10.12fe10", "None", []], + "": ["10.fU", "None", []], + "": ["21.3E56E4654", "None", []], + "": ["105e4d", "None", []], + "": ["105flu", "None", []], + "": ["105fu", "None", []], + "": ["105eu", "None", []], +})) +def test_lexer_parse_float_literal(source: str, str_expected: str, errors: List[E]): + lexer = lexer_from_source(source) + token = lexer.parse_float_literal() + + assert str(token) == str_expected + assert repr(lexer.file.errors) == repr(errors) + + +@pytest.mark.parametrize("source, str_expected", dict_to_pytest_param({ + "Identifier starting with an integer": ["42_hello", "None"], + "Identifier starting with an underscore": ["_hello", ""], + "ft_printf identifier": ["ft_printf", ""], + "Identifier with just underscore": ['_', ""], + "Identifier with just one letter": ['a', ""], + "Identifier with uppercase letters": ["EGGS", ""], + "Identifier with mixedcase letters": ["AbCd", ""], + "Identifier with lowercase letters": ["duck", ""], + "Identifier with an hyphen": ["clojure-is-cool", ""], + "Identifier with integers, letters and underscores": ["ascii_2_bigint128", ""], + "String starting with an letter": ["L\"ola\"", ""], + "Char starting with an letter": ["L'1'", ""], +})) +def test_lexer_parse_identifier(source: str, str_expected: str): + lexer = lexer_from_source(source) + token = lexer.parse_identifier() + + assert str(token) == str_expected + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("keyword", keywords.keys()) +def test_lexer_parse_identifier_keyword_only(keyword: str): + lexer = lexer_from_source(keyword) + token = lexer.parse_identifier() + + assert str(token) == f"<{keyword.upper()}>" + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("operator, token_type", operators.items()) +def test_lexer_parse_operator(operator: str, token_type: str): + lexer = lexer_from_source(operator) + token = lexer.parse_operator() + + assert str(token) == f"<{token_type}>" + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("bracket, token_type", 
brackets.items()) +def test_lexer_parse_brackets(bracket: str, token_type: str): + lexer = lexer_from_source(bracket) + token = lexer.parse_brackets() + + assert str(token) == f"<{token_type}>" + assert lexer.file.errors.status == "OK" + + +@pytest.mark.parametrize("source, expected_tokens", dict_to_pytest_param({ + "Empty source": ['', []], + "Just space source": [" ", [ + T("SPACE", (1, 1)), + T("SPACE", (1, 2)), + T("SPACE", (1, 3)), + ]], + "Identifier followed by a comment": ["test//comment", [ + T("IDENTIFIER", (1, 1), "test"), + T("COMMENT", (1, 5), "//comment"), + ]], + "Main function prototype with void": ["int\tmain(void);", [ + T("INT", (1, 1)), + T("TAB", (1, 4)), + T("IDENTIFIER", (1, 5), value="main"), + T("LPARENTHESIS", (1, 9)), + T("VOID", (1, 10)), + T("RPARENTHESIS", (1, 14)), + T("SEMI_COLON", (1, 15)), + ]], + # Checks if `identifier` is bellow to `char` and `string` + "Wide char/string followed by identifier": ["L'a' L\"bcd\" name", [ + T("CHAR_CONST", (1, 1), value="L'a'"), + T("SPACE", (1, 5)), + T("STRING", (1, 6), value="L\"bcd\""), + T("SPACE", (1, 12)), + T("IDENTIFIER", (1, 13), value="name"), + ]], + "Integer": ["42", [T("CONSTANT", (1, 1), value="42")]], + "Integer with plus sign": ["+42", [ + T("PLUS", (1, 1)), + T("CONSTANT", (1, 2), value="42"), + ]], + "Integer with minus sign": ["-42", [ + T("MINUS", (1, 1)), + T("CONSTANT", (1, 2), value="42"), + ]], + "Integer with double sign": ["+-42", [ + T("PLUS", (1, 1)), + T("MINUS", (1, 2)), + T("CONSTANT", (1, 3), value="42"), + ]], + "Float": ["4.2", [T("CONSTANT", (1, 1), value="4.2")]], + "Float without integer part": [".42", [T("CONSTANT", (1, 1), value=".42")]], + "Float exponential": ["4e2", [T("CONSTANT", (1, 1), value="4e2")]], + "Float with exponential in fractional part without integer": [".4e2", [T("CONSTANT", (1, 1), value=".4e2")]], + "Float exponential with suffix": ["4e2f", [T("CONSTANT", (1, 1), value="4e2f")]], + "Float exponential in fractional part with suffix": [".4e2f", [T("CONSTANT", (1, 1), value=".4e2f")]], + "Octal": ["042", [T("CONSTANT", (1, 1), value="042")]], + "Hexadecimal": ["0x42", [T("CONSTANT", (1, 1), value="0x42")]], + "Negative hexadecimal": ["-0x4e2", [ + T("MINUS", (1, 1)), + T("CONSTANT", (1, 2), value="0x4e2"), + ]], + "Integer with l as suffix": ["42l", [T("CONSTANT", (1, 1), value="42l")]], + "Integer with ul as suffix": ["42ul", [T("CONSTANT", (1, 1), value="42ul")]], + "Integer with ll as suffix": ["42ll", [T("CONSTANT", (1, 1), value="42ll")]], + "Integer with ull as suffix": ["42ull", [T("CONSTANT", (1, 1), value="42ull")]], + "Integer with u suffix": ["42u", [T("CONSTANT", (1, 1), value="42u")]], + "Multiples signs": ["-+-+-+-+-+-+-+-0Xe4Ae2", [ + T("MINUS", (1, 1)), + T("PLUS", (1, 2)), + T("MINUS", (1, 3)), + T("PLUS", (1, 4)), + T("MINUS", (1, 5)), + T("PLUS", (1, 6)), + T("MINUS", (1, 7)), + T("PLUS", (1, 8)), + T("MINUS", (1, 9)), + T("PLUS", (1, 10)), + T("MINUS", (1, 11)), + T("PLUS", (1, 12)), + T("MINUS", (1, 13)), + T("PLUS", (1, 14)), + T("MINUS", (1, 15)), + T("CONSTANT", (1, 16), value="0Xe4Ae2"), + ]], + "Member expression with left part": [".e42", [ + T("DOT", (1, 1)), + T("IDENTIFIER", (1, 2), value="e42") + ]], + "Multiples dots in float": ["4.4.4", [T("CONSTANT", (1, 1), value="4.4.4")]], + "Multiples exponents": ["4e4e4", [T("CONSTANT", (1, 1), value="4e4e4")]], + "Bad suffix 1": ["4x4x4", [T("CONSTANT", (1, 1), value="4x4x4")]], + "Bad suffix 2": ["42uul", [T("CONSTANT", (1, 1), value="42uul")]], + "Bad suffix 3": ["42Lllu", [T("CONSTANT", (1, 
1), value="42Lllu")]], + "Bad suffix 4": ["42lul", [T("CONSTANT", (1, 1), value="42lul")]], + "Bad exponent": [".42e", [T("CONSTANT", (1, 1), ".42e")]], + "Escaped newline followed by an identifier": ["\\\nhello;", [ + T("IDENTIFIER", (2, 1), value="hello"), + T("SEMI_COLON", (2, 6)), + ]] + # TODO Add to check prepoc tokens +})) +def test_lexer_tokens(source: str, expected_tokens: List[T]): + lexer = lexer_from_source(source) + tokens = list(lexer) + + assert tokens == expected_tokens diff --git a/tests/tokenizer/token_errors_test.py b/tests/tokenizer/token_errors_test.py deleted file mode 100644 index e19bda1..0000000 --- a/tests/tokenizer/token_errors_test.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest - -from norminette.file import File -from norminette.lexer import Lexer, TokenError - - -failed_tokens_tests = [ - {"text": "\tdouble f=45e++ai", "line": 1, "pos": 14}, - {"text": '\tchar *b = "e42\n\n', "line": 1, "pos": 15}, - {"text": "int\t\t\tn\t= 0x1uLl;", "line": 1, "pos": 19}, - {"text": 'char\t\t\t*yo\t\t\t= "', "line": 1, "pos": 31}, - {"text": "{return 1;}\\\\\\n", "line": 1, "pos": 12}, - {"text": "int a = a+++++a;\ndouble b = .0e4x;", "line": 2, "pos": 12}, - {"text": "int a = 1;\nint b = 10ul;\nint c = 10lul;\n", "line": 3, "pos": 9}, - {"text": "int number = 0x1uLl;", "line": 1, "pos": 14}, - {"text": "int number = 0x1ULl;", "line": 1, "pos": 14}, - {"text": "int number = 0x1lL;", "line": 1, "pos": 14}, - {"text": "int number = 0x1Ll;", "line": 1, "pos": 14}, - {"text": "int number = 0x1UlL;", "line": 1, "pos": 14}, - {"text": "int number = 10ullll", "line": 1, "pos": 14}, - {"text": "int number = 10lul", "line": 1, "pos": 14}, - {"text": "int number = 10lUl", "line": 1, "pos": 14}, - {"text": "int number = 10LUl", "line": 1, "pos": 14}, - {"text": "int number = 10uu", "line": 1, "pos": 14}, - {"text": "int number = 10Uu", "line": 1, "pos": 14}, - {"text": "int number = 10UU", "line": 1, "pos": 14}, - {"text": "int number = 0b0101e", "line": 1, "pos": 14}, - {"text": "int number = 0b0101f", "line": 1, "pos": 14}, - {"text": "int number = 0b0X101f", "line": 1, "pos": 14}, - {"text": "int number = 0X101Uf", "line": 1, "pos": 14}, - {"text": "int number = 0101f", "line": 1, "pos": 14}, - {"text": "float number=10.12fe10", "line": 1, "pos": 14}, - {"text": "float number=10.fU", "line": 1, "pos": 14}, - {"text": "float number=21.3E56E4654", "line": 1, "pos": 14}, - {"text": "float number=105e4d", "line": 1, "pos": 14}, - {"text": "float number=105flu", "line": 1, "pos": 14}, - {"text": "float number=105fu", "line": 1, "pos": 14}, - {"text": "float number=105eu", "line": 1, "pos": 14}, -] - - -@pytest.mark.parametrize( - "data", failed_tokens_tests, ids=[data["text"] for data in failed_tokens_tests] -) -def test_tokenizing_errors(data): - text, line, pos = data.values() - - with pytest.raises(TokenError, match=f"({line}, {pos})"): - Lexer(File("", text)).check_tokens() diff --git a/tests/tokenizer/token_generator_test.py b/tests/tokenizer/token_generator_test.py index 4061f92..67d9123 100644 --- a/tests/tokenizer/token_generator_test.py +++ b/tests/tokenizer/token_generator_test.py @@ -15,6 +15,14 @@ def test_rule_for_file(file): with open(f"{file.split('.')[0]}.tokens") as out_file: out_content = out_file.read() - output = Lexer(File(file)).check_tokens() + lexer = Lexer(File(file)) + + output = '' + tokens = list(lexer) + if tokens: + for token in tokens: + output += str(token) + '\n' * int(token.type == "NEWLINE") + if tokens[-1].type != "NEWLINE": + output += "\n" assert 
output == out_content
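
Usage sketch: with TokenError removed, lexical problems are now recorded on file.errors instead of being raised, and the lexer is consumed as a plain iterator. A minimal example, assuming the File(path, source) constructor and the errors.status attribute exercised by the tests above (the "sample.c" path is illustrative only):

    from norminette.file import File
    from norminette.lexer import Lexer

    file = File("sample.c", "int\tmain(void);")  # hypothetical path, source as in the iterator tests
    tokens = list(Lexer(file))                   # replaces the old lexer.get_tokens()
    for token in tokens:
        print(token)                             # prints each token's string form
    print(file.errors.status)                    # "OK" when no lexing error was recorded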