/// AUTOGENERATED COPYRIGHT HEADER START // Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #include "lexer.hpp" #include #include #include #include "util.hpp" std::string blitz::token::to_string() { std::string name; switch (type) { case variant::UNKNOWN: name = "Unknown"; break; case variant::ENDOFFILE: name = "EndOfFile"; break; case variant::NEWLINE: name = "NewLine"; break; case variant::CONTROL: name = "Control"; break; case variant::COMMENT: name = "Comment"; break; case variant::TEXT: name = "Text"; break; case variant::STRING: name = "String"; break; case variant::INTEGER: name = "Integer"; break; case variant::REAL: name = "Real"; break; case variant::SYMBOL: name = "Symbol"; break; default: name = "Invalid"; break; } if (type == variant::NEWLINE || type == variant::CONTROL) { return blitz::format("%s(%llu@%llu, %d)", name.c_str(), location.first, location.second, text[0]); } else { return blitz::format("%s(%llu@%llu, %s)", name.c_str(), location.first, location.second, text.c_str()); } } bool blitz::token::operator==(variant rhs) { return type == rhs; } bool blitz::token::operator==(std::string const& rhs) { return text == rhs; } blitz::lexer::~lexer() {} blitz::lexer::lexer(std::filesystem::path file) { // Usually files start at line 1 and character 1, so we should start there too. _location = { 1, 1 }; // Try and open the file for reading. _file = file; _stream = std::ifstream(_file, std::ios_base::binary); // We use binary so we can eventually support UTF-8. if (!_stream.good() || _stream.eof() || _stream.bad() || _stream.fail()) { throw std::runtime_error(blitz::format("Reading file '%s' failed.", file.generic_string().c_str())); } // Initialize token storage to a default token. _next = _current = blitz::token{ .location = { 0, 0 }, .text = "", .type = token::variant::NONE, }; } blitz::token blitz::lexer::current() { return _current; } blitz::token blitz::lexer::next() { _current = peek(); _next = blitz::token{ .location = { 0, 0 }, .text = "", .type = token::variant::NONE, }; return _current; } blitz::token blitz::lexer::peek() { if (_next.type == blitz::token::variant::NONE) { // ToDo: Optimize enum class stage { DEFAULT, TEXT, NUMBER, STRING, COMMENT, } state = stage::DEFAULT; std::stringstream buffer; blitz::token token{ .location = _location, .text = "", .type = blitz::token::variant::UNKNOWN, }; auto issymbol = [](int chr) { return blitz::utility::is_symbol(chr); }; auto iswhitespace = [](int chr) { return blitz::utility::is_white_space(chr); }; // ToDo: Figure out why we don't ever hit chr == EOF. if (_stream.eof()) { token.location = _location; token.type = blitz::token::variant::ENDOFFILE; return token; } bool complete = false; while (!complete && _stream.good()) { // Peek at the current byte, without advancing the read pointer until we need to. auto chr = _stream.peek(); bool is_newline = (chr == '\r') || (chr == '\n'); bool is_returnfeed = (chr == '\r'); if (state == stage::DEFAULT) { if (chr == EOF) { token.type = blitz::token::variant::ENDOFFILE; token.text = ""; token.location = _location; complete = true; _location.second++; } else if (is_newline) { // New Line, should be handled like a control character, but with some special things. token.type = blitz::token::variant::NEWLINE; token.text = "\n"; token.location = _location; complete = true; // Advance the read pointer. _stream.get(); // Is this a Windows-style \r\n? if (is_returnfeed && (_stream.peek() == '\n')) { // If so, advance the read pointer again. _stream.get(); } // Then update the location. _location.first++; _location.second = 1; } else if (iswhitespace(chr)) { // This is white space, which we'll happily ignore. _stream.get(); _location.second++; } else if (chr < 32) { // Likely to be a control character. token.location = _location; token.type = blitz::token::variant::CONTROL; token.text = { 1, char(chr) }; complete = true; _stream.get(); _location.second++; /*} else if (chr == ':') { // Allows code writers to pretend it's all one line. token.location = _location; token.type = blitz::token::variant::SEPARATOR; token.text = {1, char(chr)}; complete = true; _stream.get(); _location.second++;*/ } else if (chr == ';') { // A comment, which ends at the next new line. state = stage::COMMENT; token.location = _location; token.type = blitz::token::variant::COMMENT; } else if (isdigit(chr)) { // Probably an Integer, or if the latter, it's a Real. state = stage::NUMBER; token.location = _location; token.type = blitz::token::variant::INTEGER; } else if (isalpha(chr)) { // Text of some kind. state = stage::TEXT; token.location = _location; token.type = blitz::token::variant::TEXT; } else if (chr == '"') { // A quoted string. state = stage::STRING; token.location = _location; token.type = blitz::token::variant::STRING; // Advance so we actually get anywhere. _stream.get(); _location.second++; } else if (issymbol(chr)) { // Special Handling for a few symbols that could mean multiple things. if (chr == '.') { // '.' can start a Real, Label or Structured Type Access. We don't want to decide on the latter here, that's a parser thing. buffer << (char)chr; // We advance the read pointer here to look at what's coming next. _stream.get(); chr = _stream.peek(); _location.second++; // Peek at what's coming next. if (isdigit(chr)) { // This is a Real number. token.location = _location; token.type = blitz::token::variant::REAL; state = stage::NUMBER; } else { // Assume this is a symbol and return to normal behavior. token.location = _location; token.text = buffer.str(); token.type = blitz::token::variant::SYMBOL; complete = true; } } else if ((chr == '+') || (chr == '-')) { // '+' & '-' could be prefixes to an Integer or Real. buffer << (char)chr; // Advance the read pointer to peek at the future. _stream.get(); chr = _stream.peek(); _location.second++; // Peek at what's coming up. if (isdigit(chr) || (chr == '.')) { // Likely to be a Real or Integer. token.location = _location; if (chr == '.') { token.type = blitz::token::variant::REAL; } else { token.type = blitz::token::variant::INTEGER; } state = stage::NUMBER; } else { token.location = _location; token.text = buffer.str(); token.type = blitz::token::variant::SYMBOL; complete = true; } } else { token.location = _location; token.text = { char(chr) }; token.type = blitz::token::variant::SYMBOL; complete = true; // Advance so we actually get anywhere. _stream.get(); _location.second++; } } else { // Everything else is an error throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it."); } } else if (state == stage::NUMBER) { if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) { // EOF, Control, NL, Whitespace, and Comments should return to default parsing. complete = true; } else if (chr == 'f') { _stream.get(); token.type = blitz::token::variant::REAL; complete = true; } else if (chr == 'u') { _stream.get(); buffer << (char)chr; token.type = blitz::token::variant::INTEGER; complete = true; } else if ((chr == 'b') || (chr == 'x')) { _stream.get(); buffer << (char)chr; if (buffer.tellp() > 2) { throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [0-9], got '%s' instead.", token.to_string().c_str(), std::string{ 1, (char)chr }.c_str())); } } else if (isdigit(chr) || (chr == '.')) { _stream.get(); buffer << (char)chr; if (chr == '.') { if (token.type != token::variant::REAL) { token.type = blitz::token::variant::REAL; } else { token.text = buffer.str(); throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [0-9], got '%s' instead.", token.to_string().c_str(), std::string{ 1, (char)chr }.c_str())); } } } else if (issymbol(chr)) { complete = true; } else { token.text = buffer.str(); throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected ([0](b|x|))[0-9.], got '%s' instead.", token.to_string().c_str(), std::string{ 1, (char)chr }.c_str())); } if (complete) { token.text = buffer.str(); } } else if (state == stage::TEXT) { if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) { // Return to default parsing. complete = true; } else if (isalpha(chr) || isdigit(chr) || (chr == '_')) { buffer << (char)chr; _stream.get(); _location.second++; } else { token.text = buffer.str(); throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [a-zA-Z0-9_], got '%s' instead.", token.to_string().c_str(), std::string{ 1, (char)chr }.c_str())); } if (complete) { token.text = buffer.str(); } } else if (state == stage::STRING) { if ((chr == EOF) || (chr < 32) || is_newline) { // Return to default parsing. complete = true; } else if (chr == '"') { // The only true way to end a string. complete = true; // Skip over the " so we don't confuse the parser. _stream.get(); _location.second++; } else { buffer << (char)chr; _stream.get(); _location.second++; } if (complete) { token.text = buffer.str(); } } else if (state == stage::COMMENT) { if ((chr == EOF) || (chr < 32) || is_newline) { // Return to default parsing at this point. complete = true; } else { buffer << (char)chr; _stream.get(); _location.second++; } if (complete) { token.text = buffer.str(); } } } _next = token; } return _next; } std::filesystem::path blitz::lexer::file() { return std::filesystem::path(_file); }