/// AUTOGENERATED COPYRIGHT HEADER START // Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #include "lexer.hpp" #include #include #include std::string format(const char* format, ...) { va_list arg1; va_list arg2; va_start(arg1, format); va_copy(arg2, arg1); int length = vsnprintf(nullptr, 0, format, arg1); std::vector buffer(length + 1); vsnprintf(buffer.data(), buffer.size(), format, arg2); va_end(arg1); va_end(arg2); return {buffer.data(), buffer.data() + length}; } std::string blitz::token::to_string() { std::string name; switch (type) { case variant::UNKNOWN: name = "Unknown"; break; case variant::ENDOFFILE: name = "EndOfFile"; break; case variant::NEWLINE: name = "NewLine"; break; case variant::CONTROL: name = "Control"; break; case variant::COMMENT: name = "Comment"; break; case variant::TEXT: name = "Text"; break; case variant::STRING: name = "String"; break; case variant::INTEGER: name = "Integer"; break; case variant::REAL: name = "Real"; break; case variant::SYMBOL: name = "Symbol"; break; default: name = "How the fuck?!"; break; } if (type == variant::NEWLINE || type == variant::CONTROL) { return format("%s(%llu@%llu, %d)", name.c_str(), location.first, location.second, text[0]); } else { return format("%s(%llu@%llu, %s)", name.c_str(), location.first, location.second, text.c_str()); } } blitz::lexer::~lexer() {} blitz::lexer::lexer(std::filesystem::path file) { // Usually files start at line 1 and character 1, so we should start there too. _location = {1, 1}; // Try and open the file for reading. _file = file; _stream = std::ifstream(_file, std::ios_base::binary); // We use binary so we can eventually support UTF-8. if (!_stream.good() || _stream.eof() || _stream.bad() || _stream.fail()) { throw std::runtime_error(format("Reading file '%s' failed.", file.generic_string().c_str())); } // Initialize token storage to a default token. _override = _current = blitz::token{ .location = {0, 0}, .text = "", .type = token::variant::UNKNOWN, }; } blitz::token blitz::lexer::current() { return _current; } blitz::token blitz::lexer::next() { enum class stage { DEFAULT, TEXT, NUMBER, STRING, COMMENT, } state = stage::DEFAULT; std::stringstream buffer; blitz::token token{ .location = _location, .text = "", .type = blitz::token::variant::UNKNOWN, }; auto issymbol = [](int chr) { switch (chr) { case ';': // Comment case ':': // Command Separator case '=': // Equal case '<': // Less Than case '>': // Greater Than case '~': // Bitwise Not case '^': // Exponential (X ^ Y = pow(X, Y)) case '+': // Plus case '-': // Minus case '*': // Multiply case '/': // Divide case ',': // Parameter Separation case '%': // Integer Type case '#': // Real Type case '$': // String Type case '.': // Structured Type case '\\': // Structured Type Access // Blitz Arrays case '[': case ']': // Call, Grouping, Dim case '(': case ')': return true; default: return false; } return false; }; auto iswhitespace = [](int chr) { switch (chr) { case ' ': case '\t': return true; default: return false; } return false; }; // ToDo: Figure out why we don't ever hit chr == EOF. if (_stream.eof()) { token.location = _location; token.type = blitz::token::variant::ENDOFFILE; return token; } bool complete = false; while (!complete && _stream.good()) { // Peek at the current byte, without advancing the read pointer until we need to. auto chr = _stream.peek(); bool is_newline = (chr == '\r') || (chr == '\n'); bool is_returnfeed = (chr == '\r'); if (state == stage::DEFAULT) { if (chr == EOF) { token.type = blitz::token::variant::ENDOFFILE; token.text = ""; token.location = _location; complete = true; _location.second++; } else if (is_newline) { // New Line, should be handled like a control character, but with some special things. token.type = blitz::token::variant::NEWLINE; token.text = "\n"; token.location = _location; complete = true; // Advance the read pointer. _stream.get(); // Is this a Windows-style \r\n? if (is_returnfeed && (_stream.peek() == '\n')) { // If so, advance the read pointer again. _stream.get(); } // Then update the location. _location.first++; _location.second = 1; } else if (iswhitespace(chr)) { // This is white space, which we'll happily ignore. _stream.get(); _location.second++; } else if (chr < 32) { // Likely to be a control character. token.location = _location; token.type = blitz::token::variant::CONTROL; token.text = {1, char(chr)}; complete = true; _stream.get(); _location.second++; } else if (chr == ':') { // Allows code writers to pretend it's all one line. token.location = _location; token.type = blitz::token::variant::SEPARATOR; token.text = {1, char(chr)}; complete = true; _stream.get(); _location.second++; } else if (chr == ';') { // A comment, which ends at the next new line. state = stage::COMMENT; token.location = _location; token.type = blitz::token::variant::COMMENT; } else if (isdigit(chr)) { // Probably an Integer, or if the latter, it's a Real. state = stage::NUMBER; token.location = _location; token.type = blitz::token::variant::INTEGER; } else if (isalpha(chr)) { // Text of some kind. state = stage::TEXT; token.location = _location; token.type = blitz::token::variant::TEXT; } else if (chr == '"') { // A quoted string. state = stage::STRING; token.location = _location; token.type = blitz::token::variant::STRING; // Advance so we actually get anywhere. _stream.get(); _location.second++; } else if (issymbol(chr)) { // Special Handling for a few symbols that could mean multiple things. if (chr == '.') { // '.' can start a Real, Label or Structured Type Access. We don't want to decide on the latter here, that's a parser thing. buffer << (char)chr; // We advance the read pointer here to look at what's coming next. _stream.get(); chr = _stream.peek(); _location.second++; // Peek at what's coming next. if (isdigit(chr)) { // This is a Real number. token.location = _location; token.type = blitz::token::variant::REAL; state = stage::NUMBER; } else { // Assume this is a symbol and return to normal behavior. token.location = _location; token.text = buffer.str(); token.type = blitz::token::variant::SYMBOL; complete = true; } } else if ((chr == '+') || (chr == '-')) { // '+' & '-' could be prefixes to an Integer or Real. buffer << (char)chr; // Advance the read pointer to peek at the future. _stream.get(); chr = _stream.peek(); _location.second++; // Peek at what's coming up. if (isdigit(chr) || (chr == '.')) { // Likely to be a Real or Integer. token.location = _location; if (chr == '.') { token.type = blitz::token::variant::REAL; } else { token.type = blitz::token::variant::INTEGER; } state = stage::NUMBER; } else { token.location = _location; token.text = buffer.str(); token.type = blitz::token::variant::SYMBOL; complete = true; } } else { token.location = _location; token.text = {1, char(chr)}; token.type = blitz::token::variant::SYMBOL; complete = true; // Advance so we actually get anywhere. _stream.get(); _location.second++; } } else { // Everything else is an error throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it."); } } else if (state == stage::NUMBER) { if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) { // EOF, Control, NL, Whitespace, and Comments should return to default parsing. complete = true; } else if (isdigit(chr) || (chr == '.')) { _stream.get(); buffer << (char)chr; if (chr == '.') { if (token.type != token::variant::REAL) { token.type = blitz::token::variant::REAL; } else { token.text = buffer.str(); throw blitz::error(_file, token.location, _location, format("In token %s: Expected [0-9], got '%s' instead.", token.to_string().c_str(), std::string{1, (char)chr}.c_str())); } } } else if (issymbol(chr)) { complete = true; } else { token.text = buffer.str(); throw blitz::error(_file, token.location, _location, format("In token %s: Expected [0-9.], got '%s' instead.", token.to_string().c_str(), std::string{1, (char)chr}.c_str())); } if (complete) { token.text = buffer.str(); } } else if (state == stage::TEXT) { if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) { // Return to default parsing. complete = true; } else if (isalpha(chr) || isdigit(chr) || (chr == '_')) { buffer << (char)chr; _stream.get(); _location.second++; } else { token.text = buffer.str(); throw blitz::error(_file, token.location, _location, format("In token %s: Expected [a-zA-Z0-9_], got '%s' instead.", token.to_string().c_str(), std::string{1, (char)chr}.c_str())); } if (complete) { token.text = buffer.str(); } } else if (state == stage::STRING) { if ((chr == EOF) || (chr < 32) || is_newline) { // Return to default parsing. complete = true; } else if (chr == '"') { // The only true way to end a string. complete = true; // Skip over the " so we don't confuse the parser. _stream.get(); _location.second++; } else { buffer << (char)chr; _stream.get(); _location.second++; } if (complete) { token.text = buffer.str(); } } else if (state == stage::COMMENT) { if ((chr == EOF) || (chr < 32) || is_newline) { // Return to default parsing at this point. complete = true; } else { buffer << (char)chr; _stream.get(); _location.second++; } if (complete) { token.text = buffer.str(); } } } _current = token; return _current; } /* std::pair blitz::lexer::current() { return _current; } std::pair blitz::lexer::next(std::istream& fs) { std::stringstream buffer; blitz::tokentype token; enum class parserState { DEFAULT, TEXT, NUMBER, STRING, COMMENT, } state = parserState::DEFAULT; while ((token == blitz::tokentype::TokenUnknown) && !fs.eof() && fs.good()) { auto chr = fs.get(); } } /* std::pair blitz::lexer::next(std::shared_ptr fs) { std::string buf; token tkn = token::TokenUnknown; bool haveResult = false; // Allow "overriding" the next retrieved Token. if (m_overrideToken != token::TokenUnknown) { buf = m_overrideText; tkn = m_overrideToken; m_overrideToken = token::TokenUnknown; haveResult = true; } bool m_isTextMode = false; bool m_isNumberMode = false; bool m_isStringMode = false; bool m_isCommentMode = false; bool m_numberModeHasDecimal = false; while (((fs->eof() == false) && (fs->good())) && !haveResult) { char chr = fs->get(); if (chr == '\r' || chr == '\n') { if (tkn != token::TokenEOF) { m_overrideToken = token::TokenNewLine; m_overrideText = ""; } else { tkn = token::TokenNewLine; buf = ""; } m_isStringMode = false; m_isNumberMode = false; m_isTextMode = false; m_isCommentMode = false; break; } else if (m_isStringMode) { if (chr == '\"') { m_overrideToken = token::TokenDoubleQuote; m_overrideText = chr; m_isStringMode = false; tkn = token::TokenQuotedText; break; } else if (iscntrl(chr) || !isprint(chr)) { fs->putback(chr); m_isStringMode = false; break; } else { buf += chr; } } else if (m_isTextMode) { if (isalnum(chr) || (chr == '_')) { buf += chr; } else { fs->putback(chr); m_isTextMode = false; break; } } else if (m_isNumberMode) { if (isdigit(chr)) { buf += chr; } else if (chr == '.') { if (m_numberModeHasDecimal == false) { m_numberModeHasDecimal = true; tkn = token::TokenDecimal; buf += chr; } else { fs->putback(chr); m_isNumberMode = false; break; } } else { fs->putback(chr); m_isNumberMode = false; break; } } else if (m_isCommentMode) { buf += chr; tkn = token::TokenComment; } else { // Whitespace if (isspace(chr)) continue; // Control Code if (iscntrl(chr)) { tkn = token::TokenUnknown; buf = chr; } // Special handling for + and -, due to numbers and decimals. if (chr == '+' || chr == '-') { char chr2 = fs->get(); if (isdigit(chr2)) { m_isNumberMode = true; m_numberModeHasDecimal = false; tkn = token::TokenNumber; buf = chr + chr2; break; } else if (chr2 == '.') { m_isNumberMode = true; m_numberModeHasDecimal = true; tkn = token::TokenDecimal; buf = chr + "0" + chr2; break; } else { fs->putback(chr2); } } // Symbol for (auto v : g_symbolCharacters) { if (v.first == chr) { tkn = v.second; buf = v.first; break; } } if (tkn != token::TokenEOF) { haveResult = true; break; } // Strings, Text, Numbers if (chr == ';') { m_isCommentMode = true; tkn = token::TokenSemicolon; buf = chr; break; } else if (chr == '\"') { m_isStringMode = true; tkn = token::TokenDoubleQuote; buf = chr; break; } else if (isalpha(chr)) { m_isTextMode = true; tkn = token::TokenText; buf = chr; } else if (isdigit(chr)) { m_isNumberMode = true; m_numberModeHasDecimal = false; tkn = token::TokenNumber; buf = chr; } else if (chr == '.') { m_isNumberMode = true; m_numberModeHasDecimal = true; tkn = token::TokenDecimal; buf = "0" + chr; } else { tkn = token::TokenUnknown; buf = chr; break; } } } // Convert from Text into native Token. if (tkn == token::TokenText) tkn = to_token(tkn, buf); return std::make_pair(tkn, buf); } blitz::lexer::token blitz::lexer::to_token(token in, std::string text) { static std::pair l_textToTokenList[] = { // Binary { "not", token::TokenNot }, { "and", token::TokenAnd }, { "or", token::TokenOr }, { "xor", token::TokenXor }, { "shl", token::TokenShl }, { "shr", token::TokenShr }, { "sal", token::TokenSal }, { "sar", token::TokenSar }, { "false", token::TokenFalse }, { "true", token::TokenTrue }, // Conversion { "float", token::TokenFloat }, { "string", token::TokenString }, { "hex", token::TokenHex }, { "int", token::TokenInt }, // Control { "if", token::TokenIf }, { "then", token::TokenThen }, { "elseif", token::TokenElseIf }, { "else", token::TokenElse }, { "endif", token::TokenEndIf }, { "select", token::TokenSelect }, { "case", token::TokenCase }, { "default", token::TokenDefault }, { "goto", token::TokenGoto }, { "gosub", token::TokenGosub }, { "return", token::TokenReturn }, { "function", token::TokenFunction }, { "end", token::TokenEnd }, { "stop", token::TokenStop }, // Loop { "for", token::TokenFor }, { "to", token::TokenTo }, { "next", token::TokenNext }, { "while", token::TokenWhile }, { "wend", token::TokenWend }, { "repeat", token::TokenRepeat }, { "until", token::TokenUntil }, { "forever", token::TokenForever }, { "exit", token::TokenExit }, // Math { "abs", token::TokenAbs }, { "sign", token::TokenSign }, { "cos", token::TokenCos }, { "sin", token::TokenSin }, { "tan", token::TokenTan }, { "acos", token::TokenACos }, { "asin", token::TokenASin }, { "atan", token::TokenATan }, { "atan2", token::TokenATan2 }, { "log", token::TokenLog }, { "log10", token::TokenLog10 }, { "ceil", token::TokenCeil }, { "floor", token::TokenFloor }, { "mod", token::TokenMod }, { "pi", token::TokenPi }, { "exp", token::TokenExp }, { "sqr", token::TokenSqr }, // Variables { "const", token::TokenConst }, { "global", token::TokenGlobal }, { "local", token::TokenLocal }, // Includes { "include", token::TokenInclude }, }; for (auto v : l_textToTokenList) { if (stricmp(text.c_str(), v.first)) { return v.second; } } return in; } */