Latest stuff, rewriting lexer

This commit is contained in:
Michael Fabian 'Xaymar' Dirks
2024-06-25 18:59:15 +02:00
parent 7f669f55e2
commit fa81c2a7fa
23 changed files with 1263 additions and 310 deletions
+234 -112
View File
@@ -1,49 +1,170 @@
/// AUTOGENERATED COPYRIGHT HEADER START
// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks <info@xaymar.com>
// AUTOGENERATED COPYRIGHT HEADER END
#include "lexer.hpp"
#include <codecvt>
#include <sstream>
std::pair<char, blitz::Lexer::Token> g_symbolCharacters[] = {
/*std::pair<char, blitz::tokentype> g_symbolCharacters[] = {
//{ '\"', BlitzLLVM::Lexer::Token::TokenDoubleQuote }, // Has special meaning.
{ '+', blitz::Lexer::Token::TokenPlus },
{ '-', blitz::Lexer::Token::TokenMinus },
{ '/', blitz::Lexer::Token::TokenSlashForward },
{ '\\', blitz::Lexer::Token::TokenSlashBackward },
{ '*', blitz::Lexer::Token::TokenMultiply },
{ '=', blitz::Lexer::Token::TokenEqual },
{ '#', blitz::Lexer::Token::TokenOctothorp },
{ '%', blitz::Lexer::Token::TokenPercent },
{ '$', blitz::Lexer::Token::TokenDollar },
{ '(', blitz::Lexer::Token::TokenRoundBracketOpen },
{ ')', blitz::Lexer::Token::TokenRoundBracketClose },
{ '[', blitz::Lexer::Token::TokenSquareBracketOpen },
{ ']', blitz::Lexer::Token::TokenSquareBracketClose },
{ '<', blitz::Lexer::Token::TokenAngleBracketOpen },
{ '>', blitz::Lexer::Token::TokenAngleBracketClose },
//{ '.', BlitzLLVM::Lexer::Token::TokenDot }, // Special meaning.
{ ':', blitz::Lexer::Token::TokenColon },
{ ',', blitz::Lexer::Token::TokenComma },
//{ ';', BlitzLLVM::Lexer::Token::TokenSemicolon },
{ '^', blitz::Lexer::Token::TokenCaret },
{ '~', blitz::Lexer::Token::TokenBitNot },
};
{ '+', blitz::tokentype::TokenPlus },
{ '-', blitz::tokentype::TokenMinus },
{ '/', blitz::tokentype::TokenSlashForward },
{ '\\', blitz::tokentype::TokenSlashBackward },
{ '*', blitz::tokentype::TokenMultiply },
{ '=', blitz::tokentype::TokenEqual },
{ '#', blitz::tokentype::TokenOctothorp },
{ '%', blitz::tokentype::TokenPercent },
{ '$', blitz::tokentype::TokenDollar },
{ '(', blitz::tokentype::TokenRoundBracketOpen },
{ ')', blitz::tokentype::TokenRoundBracketClose },
{ '[', blitz::tokentype::TokenSquareBracketOpen },
{ ']', blitz::tokentype::TokenSquareBracketClose },
{ '<', blitz::tokentype::TokenAngleBracketOpen },
{ '>', blitz::tokentype::TokenAngleBracketClose },
//{ '.', BlitzLLVM::Token::TokenDot }, // Special meaning.
{ ':', blitz::tokentype::TokenColon },
{ ',', blitz::tokentype::TokenComma },
//{ ';', BlitzLLVM::Token::TokenSemicolon },
{ '^', blitz::tokentype::TokenCaret },
{ '~', blitz::tokentype::TokenBitNot },
};*/
blitz::Lexer::Lexer() {}
// Destructor with an empty body; no explicit cleanup is performed here.
blitz::lexer::~lexer() {}
blitz::Lexer::~Lexer() {}
/// Creates a lexer that reads its input from `file`.
///
/// The position counters are reset to line 0 / character 0, the file is opened
/// in binary mode, and both `_current` and `_override` are set to an empty
/// token of type `UNKNOWN`.
///
/// @param file Path of the source file to tokenize.
/// @throws std::runtime_error if the stream is not in a good state after opening.
blitz::lexer::lexer(std::filesystem::path file)
{
    // Usually files start at line and character 0, so we should start there too.
    _line = _character = 0;

    // Try and open the file for reading.
    _file   = file;
    _stream = std::ifstream(_file, std::ios_base::binary); // We use binary so we can eventually support UTF-8.
    if (!_stream.good()) {
        // good() is false as soon as any of eofbit/failbit/badbit is set, so a
        // single check suffices. The message is built with std::string so that
        // an arbitrarily long path can neither be truncated nor overrun a
        // fixed-size buffer.
        throw std::runtime_error("Reading file '" + file.generic_string() + "' failed.");
    }

    // Initialize token storage to a default token.
    _override = _current = blitz::token{
        .line      = 0,
        .character = 0,
        .text      = "",
        .type      = token::variant::UNKNOWN,
    };
}
std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shared_ptr<std::istream> fs) {
/// Returns a copy of the token held in `_current`; nothing is read from the stream.
blitz::token blitz::lexer::current()
{
    return this->_current;
}
/// Reads characters from `_stream` until one token has been recognised or the
/// stream is exhausted, stores the result in `_current` and returns it.
///
/// Only comments are recognised so far: a ';' starts a comment that runs up to
/// the next line break (LF or CRLF) or the end of the stream. The TEXT, NUMBER
/// and STRING stages exist but are never entered yet. If the stream ends before
/// a token has been recognised, the returned token keeps the type `UNKNOWN`.
///
/// @return The token that was just read.
/// @throws std::runtime_error if a number contains a second decimal point.
blitz::token blitz::lexer::next()
{
    enum class stage {
        DEFAULT,
        TEXT,
        NUMBER,
        STRING,
        COMMENT,
    } state = stage::DEFAULT;
    bool              numberHasDot = false;
    std::stringstream buffer;

    blitz::token token{
        .line      = _line,
        .character = _character,
        .text      = "",
        .type      = blitz::token::variant::UNKNOWN,
    };

    // Helper function to advance text. It also keeps the position counters in
    // sync: a consumed '\n' starts a new line, any other real character moves
    // one column to the right, and the end-of-stream marker moves nothing.
    auto advance = [this]() {
        const auto code = _stream.get();
        if (code == '\n') {
            _line++;
            _character = 0;
        } else if (code != std::char_traits<char>::eof()) {
            _character++;
        }
        return code;
    };

    while ((token.type == blitz::token::variant::UNKNOWN) && _stream.good() && !_stream.eof()) {
        const auto code = advance();
        if (code == std::char_traits<char>::eof()) {
            // get() signals the end of input through its return value; that
            // marker must not be treated as a character.
            break;
        }
        // get() returns an integer; convert it so that the buffer receives the
        // character itself instead of its decimal code.
        const char chr = static_cast<char>(code);

        if (state == stage::DEFAULT) {
            if (chr == ';') { // We've encountered a comment, so we should change state and ignore this symbol.
                state           = stage::COMMENT;
                token.line      = _line;
                token.character = _character;
                // Drop anything buffered before the ';' so that the comment
                // text only contains the comment itself. The token type is set
                // once the comment is complete, otherwise the loop would stop
                // before any comment text has been read.
                buffer.str("");
                buffer.clear();
            } else {
                buffer << chr;
            }
        } else if (state == stage::NUMBER) {
            if (isdigit(code)) {
                buffer << chr;
            } else if (chr == '.') {
                if (numberHasDot) {
                    throw std::runtime_error("Number contains more than one decimal point.");
                }
                numberHasDot = true;
                buffer << chr;
            } else {
                // TODO: finish the number token here.
            }
        } else if (state == stage::TEXT) {
            // TODO: not implemented yet.
        } else if (state == stage::STRING) {
            // TODO: not implemented yet.
        } else if (state == stage::COMMENT) {
            // A comment ends at the line break. For CRLF the '\n' is left in
            // the stream (as before); a bare '\n' is consumed.
            const bool lineEnds = (chr == '\n') || (chr == '\r' && _stream.peek() == '\n');
            if (lineEnds) {
                token.text = buffer.str();
                token.type = blitz::token::variant::COMMENT;
            } else {
                buffer << chr;
            }
        }
    }

    // A comment on the last line of the file is not followed by a line break.
    if ((state == stage::COMMENT) && (token.type == blitz::token::variant::UNKNOWN)) {
        token.text = buffer.str();
        token.type = blitz::token::variant::COMMENT;
    }

    _current = token;
    return _current;
}
/*
std::pair<blitz::tokentype, std::string> blitz::lexer::current() {
return _current;
}
std::pair<blitz::tokentype, std::string> blitz::lexer::next(std::istream& fs) {
std::stringstream buffer;
blitz::tokentype token;
enum class parserState {
DEFAULT,
TEXT,
NUMBER,
STRING,
COMMENT,
} state = parserState::DEFAULT;
while ((token == blitz::tokentype::TokenUnknown) && !fs.eof() && fs.good()) {
auto chr = fs.get();
}
}
/*
std::pair<blitz::lexer::token, std::string> blitz::lexer::next(std::shared_ptr<std::istream> fs) {
std::string buf;
Token tkn = Token::TokenEOF;
token tkn = token::TokenUnknown;
bool haveResult = false;
// Allow "overriding" the next retrieved Token.
if (m_overrideToken != Token::TokenUnknown) {
if (m_overrideToken != token::TokenUnknown) {
buf = m_overrideText;
tkn = m_overrideToken;
m_overrideToken = Token::TokenUnknown;
m_overrideToken = token::TokenUnknown;
haveResult = true;
}
@@ -56,11 +177,11 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
char chr = fs->get();
if (chr == '\r' || chr == '\n') {
if (tkn != Token::TokenEOF) {
m_overrideToken = Token::TokenNewLine;
if (tkn != token::TokenEOF) {
m_overrideToken = token::TokenNewLine;
m_overrideText = "";
} else {
tkn = Token::TokenNewLine;
tkn = token::TokenNewLine;
buf = "";
}
@@ -71,10 +192,10 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
break;
} else if (m_isStringMode) {
if (chr == '\"') {
m_overrideToken = Token::TokenDoubleQuote;
m_overrideToken = token::TokenDoubleQuote;
m_overrideText = chr;
m_isStringMode = false;
tkn = Token::TokenQuotedText;
tkn = token::TokenQuotedText;
break;
} else if (iscntrl(chr) || !isprint(chr)) {
fs->putback(chr);
@@ -97,7 +218,7 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
} else if (chr == '.') {
if (m_numberModeHasDecimal == false) {
m_numberModeHasDecimal = true;
tkn = Token::TokenDecimal;
tkn = token::TokenDecimal;
buf += chr;
} else {
fs->putback(chr);
@@ -111,7 +232,7 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
}
} else if (m_isCommentMode) {
buf += chr;
tkn = Token::TokenComment;
tkn = token::TokenComment;
} else {
// Whitespace
if (isspace(chr))
@@ -119,7 +240,7 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
// Control Code
if (iscntrl(chr)) {
tkn = Token::TokenUnknown;
tkn = token::TokenUnknown;
buf = chr;
}
@@ -129,13 +250,13 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
if (isdigit(chr2)) {
m_isNumberMode = true;
m_numberModeHasDecimal = false;
tkn = Token::TokenNumber;
tkn = token::TokenNumber;
buf = chr + chr2;
break;
} else if (chr2 == '.') {
m_isNumberMode = true;
m_numberModeHasDecimal = true;
tkn = Token::TokenDecimal;
tkn = token::TokenDecimal;
buf = chr + "0" + chr2;
break;
} else {
@@ -151,7 +272,7 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
break;
}
}
if (tkn != Token::TokenEOF) {
if (tkn != token::TokenEOF) {
haveResult = true;
break;
}
@@ -159,30 +280,30 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
// Strings, Text, Numbers
if (chr == ';') {
m_isCommentMode = true;
tkn = Token::TokenSemicolon;
tkn = token::TokenSemicolon;
buf = chr;
break;
} else if (chr == '\"') {
m_isStringMode = true;
tkn = Token::TokenDoubleQuote;
tkn = token::TokenDoubleQuote;
buf = chr;
break;
} else if (isalpha(chr)) {
m_isTextMode = true;
tkn = Token::TokenText;
tkn = token::TokenText;
buf = chr;
} else if (isdigit(chr)) {
m_isNumberMode = true;
m_numberModeHasDecimal = false;
tkn = Token::TokenNumber;
tkn = token::TokenNumber;
buf = chr;
} else if (chr == '.') {
m_isNumberMode = true;
m_numberModeHasDecimal = true;
tkn = Token::TokenDecimal;
tkn = token::TokenDecimal;
buf = "0" + chr;
} else {
tkn = Token::TokenUnknown;
tkn = token::TokenUnknown;
buf = chr;
break;
}
@@ -190,90 +311,91 @@ std::pair<blitz::Lexer::Token, std::string> blitz::Lexer::GetNextToken(std::shar
}
// Convert from Text into native Token.
if (tkn == Token::TokenText)
tkn = ConvertTextToToken(tkn, buf);
if (tkn == token::TokenText)
tkn = to_token(tkn, buf);
return std::make_pair(tkn, buf);
}
blitz::Lexer::Token blitz::Lexer::ConvertTextToToken(Token in, std::string text) {
static std::pair<const char*, Token> l_textToTokenList[] = {
blitz::lexer::token blitz::lexer::to_token(token in, std::string text) {
static std::pair<const char*, token> l_textToTokenList[] = {
// Binary
{ "not", Token::TokenNot },
{ "and", Token::TokenAnd },
{ "or", Token::TokenOr },
{ "xor", Token::TokenXor },
{ "shl", Token::TokenShl },
{ "shr", Token::TokenShr },
{ "sal", Token::TokenSal },
{ "sar", Token::TokenSar },
{ "false", Token::TokenFalse },
{ "true", Token::TokenTrue },
{ "not", token::TokenNot },
{ "and", token::TokenAnd },
{ "or", token::TokenOr },
{ "xor", token::TokenXor },
{ "shl", token::TokenShl },
{ "shr", token::TokenShr },
{ "sal", token::TokenSal },
{ "sar", token::TokenSar },
{ "false", token::TokenFalse },
{ "true", token::TokenTrue },
// Conversion
{ "float", Token::TokenFloat },
{ "string", Token::TokenString },
{ "hex", Token::TokenHex },
{ "int", Token::TokenInt },
{ "float", token::TokenFloat },
{ "string", token::TokenString },
{ "hex", token::TokenHex },
{ "int", token::TokenInt },
// Control
{ "if", Token::TokenIf },
{ "then", Token::TokenThen },
{ "elseIf", Token::TokenElseIf },
{ "else", Token::TokenElse },
{ "endIf", Token::TokenEndIf },
{ "select", Token::TokenSelect },
{ "case", Token::TokenCase },
{ "default", Token::TokenDefault },
{ "goto", Token::TokenGoto },
{ "gosub", Token::TokenGosub },
{ "return", Token::TokenReturn },
{ "function", Token::TokenFunction },
{ "end", Token::TokenEnd },
{ "stop", Token::TokenStop },
{ "if", token::TokenIf },
{ "then", token::TokenThen },
{ "elseif", token::TokenElseIf },
{ "else", token::TokenElse },
{ "endif", token::TokenEndIf },
{ "select", token::TokenSelect },
{ "case", token::TokenCase },
{ "default", token::TokenDefault },
{ "goto", token::TokenGoto },
{ "gosub", token::TokenGosub },
{ "return", token::TokenReturn },
{ "function", token::TokenFunction },
{ "end", token::TokenEnd },
{ "stop", token::TokenStop },
// Loop
{ "for", Token::TokenFor },
{ "to", Token::TokenTo },
{ "next", Token::TokenNext },
{ "while", Token::TokenWhile },
{ "wend", Token::TokenWend },
{ "repeat", Token::TokenRepeat },
{ "until", Token::TokenUntil },
{ "forever", Token::TokenForever },
{ "exit", Token::TokenExit },
{ "for", token::TokenFor },
{ "to", token::TokenTo },
{ "next", token::TokenNext },
{ "while", token::TokenWhile },
{ "wend", token::TokenWend },
{ "repeat", token::TokenRepeat },
{ "until", token::TokenUntil },
{ "forever", token::TokenForever },
{ "exit", token::TokenExit },
// Math
{ "abs", Token::TokenAbs },
{ "sign", Token::TokenSign },
{ "cos", Token::TokenCos },
{ "sin", Token::TokenSin },
{ "tan", Token::TokenTan },
{ "acos", Token::TokenACos },
{ "asin", Token::TokenASin },
{ "atan", Token::TokenATan },
{ "atan2", Token::TokenATan2 },
{ "log", Token::TokenLog },
{ "log10", Token::TokenLog10 },
{ "ceil", Token::TokenCeil },
{ "floor", Token::TokenFloor },
{ "mod", Token::TokenMod },
{ "pi", Token::TokenPi },
{ "exp", Token::TokenExp },
{ "sqr", Token::TokenSqr },
{ "abs", token::TokenAbs },
{ "sign", token::TokenSign },
{ "cos", token::TokenCos },
{ "sin", token::TokenSin },
{ "tan", token::TokenTan },
{ "acos", token::TokenACos },
{ "asin", token::TokenASin },
{ "atan", token::TokenATan },
{ "atan2", token::TokenATan2 },
{ "log", token::TokenLog },
{ "log10", token::TokenLog10 },
{ "ceil", token::TokenCeil },
{ "floor", token::TokenFloor },
{ "mod", token::TokenMod },
{ "pi", token::TokenPi },
{ "exp", token::TokenExp },
{ "sqr", token::TokenSqr },
// Variables
{ "const", Token::TokenConst },
{ "global", Token::TokenGlobal },
{ "local", Token::TokenLocal },
{ "const", token::TokenConst },
{ "global", token::TokenGlobal },
{ "local", token::TokenLocal },
// Includes
{ "include", Token::TokenInclude },
{ "include", token::TokenInclude },
};
for (auto v : l_textToTokenList) {
if (boost::iequals(text, v.first)) {
if (stricmp(text.c_str(), v.first) == 0) {
return v.second;
}
}
return in;
}
*/