Files
BlitzLLVM/code_compiler/source/lexer.cpp
T
Michael Fabian 'Xaymar' Dirks dfe3e88dbd Lexer done for now, moving on to ast
2024-06-26 00:31:45 +02:00

653 lines
17 KiB
C++

/// AUTOGENERATED COPYRIGHT HEADER START
// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks <info@xaymar.com>
// AUTOGENERATED COPYRIGHT HEADER END
#include "lexer.hpp"
#include <codecvt>
#include <cstdarg>
#include <sstream>
/// Builds a std::string from a printf-style format string and its arguments.
///
/// @param format printf-style format string.
/// @param ...    Arguments consumed by the conversion specifiers in `format`.
/// @return The formatted text. An empty string is returned when vsnprintf
///         reports a failure (negative return value) instead of indexing a
///         buffer with a negative length.
std::string format(const char* format, ...)
{
	va_list measure_args;
	va_list write_args;
	va_start(measure_args, format);
	va_copy(write_args, measure_args);

	// First pass: only measure. The result excludes the terminating '\0' and is negative on error.
	const int length = vsnprintf(nullptr, 0, format, measure_args);
	va_end(measure_args);

	std::string result;
	if (length > 0) {
		result.resize(static_cast<std::string::size_type>(length));
		// Second pass: write into the string's own storage. The '+ 1' accounts for the
		// terminator, which lands on the string's existing '\0' slot.
		vsnprintf(&result[0], result.size() + 1, format, write_args);
	}
	va_end(write_args);
	return result;
}
/// Renders the token as "<Kind>(<line>@<column>, <payload>)" for diagnostics.
///
/// For NEWLINE and CONTROL tokens the payload is the numeric value of the first
/// character of `text`; for every other kind it is `text` itself.
///
/// @return The human readable description of this token.
std::string blitz::token::to_string()
{
	const char* name = "How the fuck?!"; // Fallback for variants without a case below.
	switch (type) {
	case variant::UNKNOWN:
		name = "Unknown";
		break;
	case variant::ENDOFFILE:
		name = "EndOfFile";
		break;
	case variant::NEWLINE:
		name = "NewLine";
		break;
	case variant::CONTROL:
		name = "Control";
		break;
	case variant::SEPARATOR: // Emitted by the lexer for ':'; previously hit the fallback.
		name = "Separator";
		break;
	case variant::COMMENT:
		name = "Comment";
		break;
	case variant::TEXT:
		name = "Text";
		break;
	case variant::STRING:
		name = "String";
		break;
	case variant::INTEGER:
		name = "Integer";
		break;
	case variant::REAL:
		name = "Real";
		break;
	case variant::SYMBOL:
		name = "Symbol";
		break;
	default:
		break;
	}

	// %llu requires exactly 'unsigned long long'; cast so the call is well defined
	// regardless of the integer type used for the location pair.
	const auto line   = static_cast<unsigned long long>(location.first);
	const auto column = static_cast<unsigned long long>(location.second);

	if (type == variant::NEWLINE || type == variant::CONTROL) {
		// These payloads are not printable, so show the character code instead.
		return format("%s(%llu@%llu, %d)", name, line, column, static_cast<int>(text[0]));
	}
	return format("%s(%llu@%llu, %s)", name, line, column, text.c_str());
}
/// Nothing to release by hand; the members clean up after themselves.
blitz::lexer::~lexer() = default;
/// Opens `file` for lexing and resets the lexer state.
///
/// @param file Path of the source file to tokenize.
/// @throws std::runtime_error if the stream is not in a good state after opening.
blitz::lexer::lexer(std::filesystem::path file)
{
	// Remember which file we are reading.
	_file = file;

	// Lines and columns are counted from 1.
	_location = {1, 1};

	// Binary mode, so UTF-8 can eventually be supported.
	_stream = std::ifstream(_file, std::ios_base::binary);

	// good() is only true when none of eofbit, failbit or badbit are set.
	if (!_stream.good()) {
		throw std::runtime_error(format("Reading file '%s' failed.", file.generic_string().c_str()));
	}

	// Both token slots start out as an empty UNKNOWN token.
	const blitz::token initial{
		.location = {0, 0},
		.text     = "",
		.type     = token::variant::UNKNOWN,
	};
	_current  = initial;
	_override = _current;
}
/// @return A copy of the stored current token, without reading any input.
blitz::token blitz::lexer::current()
{
	return this->_current;
}
/// Reads the next token from the stream, stores it as the current token and returns it.
///
/// The lexer is a small state machine: DEFAULT decides what kind of token starts at the
/// peeked character, and TEXT / NUMBER / STRING / COMMENT accumulate characters into a
/// buffer until a terminating character is seen. Terminating characters are left in the
/// stream (only the closing '"' of a string is consumed).
///
/// @return The token that was read. At the end of the stream an ENDOFFILE token is returned.
/// @throws blitz::error on a character that cannot start a token, or on an unexpected
///         character inside a NUMBER or TEXT token.
blitz::token blitz::lexer::next()
{
	enum class stage {
		DEFAULT,
		TEXT,
		NUMBER,
		STRING,
		COMMENT,
	} state = stage::DEFAULT;
	std::stringstream buffer;
	blitz::token      token{
		.location = _location,
		.text     = "",
		.type     = blitz::token::variant::UNKNOWN,
	};

	auto issymbol = [](int chr) {
		switch (chr) {
		case ';': // Comment
		case ':': // Command Separator
		case '=': // Equal
		case '<': // Less Than
		case '>': // Greater Than
		case '~': // Bitwise Not
		case '^': // Exponential (X ^ Y = pow(X, Y))
		case '+': // Plus
		case '-': // Minus
		case '*': // Multiply
		case '/': // Divide
		case ',': // Parameter Separation
		case '%': // Integer Type
		case '#': // Real Type
		case '$': // String Type
		case '.': // Structured Type
		case '\\': // Structured Type Access
		// Blitz Arrays
		case '[':
		case ']':
		// Call, Grouping, Dim
		case '(':
		case ')':
			return true;
		default:
			return false;
		}
	};
	auto iswhitespace = [](int chr) {
		switch (chr) {
		case ' ':
		case '\t':
			return true;
		default:
			return false;
		}
	};
	// Builds a one character string. Note that `std::string{1, c}` and `text = {1, c}` select
	// the initializer_list overload and yield the two characters '\x01' and c instead.
	auto as_text = [](int chr) { return std::string(1, static_cast<char>(chr)); };
	// Consumes the peeked character and moves one column to the right.
	auto advance = [this]() {
		_stream.get();
		_location.second++;
	};

	// Once an earlier peek() ran into the end of the stream, eofbit is set and the loop
	// below would not run at all, so report the end of file from here.
	if (_stream.eof()) {
		token.location = _location;
		token.type     = blitz::token::variant::ENDOFFILE;
		_current       = token; // Keep current() in sync with what we return.
		return _current;
	}

	bool complete = false;
	while (!complete && _stream.good()) {
		// Peek at the current byte, without advancing the read pointer until we need to.
		auto chr           = _stream.peek();
		bool is_newline    = (chr == '\r') || (chr == '\n');
		bool is_returnfeed = (chr == '\r');

		if (state == stage::DEFAULT) {
			if (chr == EOF) {
				// Nothing is consumed here, so the location stays where it is.
				token.type     = blitz::token::variant::ENDOFFILE;
				token.text     = "";
				token.location = _location;
				complete       = true;
			} else if (is_newline) {
				// New Line, should be handled like a control character, but with some special things.
				token.type     = blitz::token::variant::NEWLINE;
				token.text     = "\n";
				token.location = _location;
				complete       = true;
				// Advance the read pointer.
				_stream.get();
				// Is this a Windows-style \r\n?
				if (is_returnfeed && (_stream.peek() == '\n')) {
					// If so, advance the read pointer again.
					_stream.get();
				}
				// Then update the location.
				_location.first++;
				_location.second = 1;
			} else if (iswhitespace(chr)) {
				// This is white space, which we'll happily ignore.
				advance();
			} else if (chr < 32) {
				// Likely to be a control character.
				token.location = _location;
				token.type     = blitz::token::variant::CONTROL;
				token.text     = as_text(chr);
				complete       = true;
				advance();
			} else if (chr == ':') {
				// Allows code writers to pretend it's all one line.
				token.location = _location;
				token.type     = blitz::token::variant::SEPARATOR;
				token.text     = as_text(chr);
				complete       = true;
				advance();
			} else if (chr == ';') {
				// A comment, which ends at the next new line. The ';' itself is not consumed
				// here, so it becomes the first character of the comment text.
				state          = stage::COMMENT;
				token.location = _location;
				token.type     = blitz::token::variant::COMMENT;
			} else if (isdigit(chr)) {
				// Starts as an Integer; the NUMBER stage turns it into a Real on a '.'.
				state          = stage::NUMBER;
				token.location = _location;
				token.type     = blitz::token::variant::INTEGER;
			} else if (isalpha(chr)) {
				// Text of some kind.
				state          = stage::TEXT;
				token.location = _location;
				token.type     = blitz::token::variant::TEXT;
			} else if (chr == '"') {
				// A quoted string.
				state          = stage::STRING;
				token.location = _location;
				token.type     = blitz::token::variant::STRING;
				// Skip the opening quote, it is not part of the text.
				advance();
			} else if (issymbol(chr)) {
				// The token starts here; record that before anything is consumed.
				token.location = _location;

				// Special Handling for a few symbols that could mean multiple things.
				if (chr == '.') { // '.' can start a Real, Label or Structured Type Access. We don't want to decide on the latter here, that's a parser thing.
					buffer << static_cast<char>(chr);
					// We advance the read pointer here to look at what's coming next.
					advance();
					chr = _stream.peek();
					if (isdigit(chr)) {
						// This is a Real number.
						token.type = blitz::token::variant::REAL;
						state      = stage::NUMBER;
					} else {
						// Assume this is a symbol and return to normal behavior.
						token.text = buffer.str();
						token.type = blitz::token::variant::SYMBOL;
						complete   = true;
					}
				} else if ((chr == '+') || (chr == '-')) { // '+' & '-' could be prefixes to an Integer or Real.
					buffer << static_cast<char>(chr);
					// Advance the read pointer to peek at the future.
					advance();
					chr = _stream.peek();
					if (isdigit(chr) || (chr == '.')) {
						// Always start as an Integer. A following '.' is still in the stream
						// and the NUMBER stage promotes the token to a Real when it reads it;
						// marking it Real here would make that '.' look like a second one.
						token.type = blitz::token::variant::INTEGER;
						state      = stage::NUMBER;
					} else {
						token.text = buffer.str();
						token.type = blitz::token::variant::SYMBOL;
						complete   = true;
					}
				} else {
					token.text = as_text(chr);
					token.type = blitz::token::variant::SYMBOL;
					complete   = true;
					// Advance so we actually get anywhere.
					advance();
				}
			} else {
				// Everything else is an error
				throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it.");
			}
		} else if (state == stage::NUMBER) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) {
				// EOF, Control, NL, Whitespace, and Comments should return to default parsing.
				complete = true;
			} else if (isdigit(chr) || (chr == '.')) {
				// Consume the character and keep the column in sync with the read pointer.
				advance();
				buffer << static_cast<char>(chr);
				if (chr == '.') {
					if (token.type != token::variant::REAL) {
						token.type = blitz::token::variant::REAL;
					} else {
						// A second decimal point.
						token.text = buffer.str();
						throw blitz::error(_file, token.location, _location, format("In token %s: Expected [0-9], got '%s' instead.", token.to_string().c_str(), as_text(chr).c_str()));
					}
				}
			} else if (issymbol(chr)) {
				complete = true;
			} else {
				token.text = buffer.str();
				throw blitz::error(_file, token.location, _location, format("In token %s: Expected [0-9.], got '%s' instead.", token.to_string().c_str(), as_text(chr).c_str()));
			}
			if (complete) {
				token.text = buffer.str();
			}
		} else if (state == stage::TEXT) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) {
				// Return to default parsing.
				complete = true;
			} else if (isalpha(chr) || isdigit(chr) || (chr == '_')) {
				buffer << static_cast<char>(chr);
				advance();
			} else {
				token.text = buffer.str();
				throw blitz::error(_file, token.location, _location, format("In token %s: Expected [a-zA-Z0-9_], got '%s' instead.", token.to_string().c_str(), as_text(chr).c_str()));
			}
			if (complete) {
				token.text = buffer.str();
			}
		} else if (state == stage::STRING) {
			if ((chr == EOF) || (chr < 32) || is_newline) {
				// Unterminated string: hand back what we have and return to default parsing.
				complete = true;
			} else if (chr == '"') { // The only true way to end a string.
				complete = true;
				// Skip over the " so we don't confuse the parser.
				advance();
			} else {
				buffer << static_cast<char>(chr);
				advance();
			}
			if (complete) {
				token.text = buffer.str();
			}
		} else if (state == stage::COMMENT) {
			if ((chr == EOF) || (chr < 32) || is_newline) {
				// Return to default parsing at this point.
				complete = true;
			} else {
				buffer << static_cast<char>(chr);
				advance();
			}
			if (complete) {
				token.text = buffer.str();
			}
		}
	}

	_current = token;
	return _current;
}
/*
std::pair<blitz::tokentype, std::string> blitz::lexer::current() {
return _current;
}
std::pair<blitz::tokentype, std::string> blitz::lexer::next(std::istream& fs) {
std::stringstream buffer;
blitz::tokentype token;
enum class parserState {
DEFAULT,
TEXT,
NUMBER,
STRING,
COMMENT,
} state = parserState::DEFAULT;
while ((token == blitz::tokentype::TokenUnknown) && !fs.eof() && fs.good()) {
auto chr = fs.get();
}
}
/*
std::pair<blitz::lexer::token, std::string> blitz::lexer::next(std::shared_ptr<std::istream> fs) {
std::string buf;
token tkn = token::TokenUnknown;
bool haveResult = false;
// Allow "overriding" the next retrieved Token.
if (m_overrideToken != token::TokenUnknown) {
buf = m_overrideText;
tkn = m_overrideToken;
m_overrideToken = token::TokenUnknown;
haveResult = true;
}
bool m_isTextMode = false;
bool m_isNumberMode = false;
bool m_isStringMode = false;
bool m_isCommentMode = false;
bool m_numberModeHasDecimal = false;
while (((fs->eof() == false) && (fs->good())) && !haveResult) {
char chr = fs->get();
if (chr == '\r' || chr == '\n') {
if (tkn != token::TokenEOF) {
m_overrideToken = token::TokenNewLine;
m_overrideText = "";
} else {
tkn = token::TokenNewLine;
buf = "";
}
m_isStringMode = false;
m_isNumberMode = false;
m_isTextMode = false;
m_isCommentMode = false;
break;
} else if (m_isStringMode) {
if (chr == '\"') {
m_overrideToken = token::TokenDoubleQuote;
m_overrideText = chr;
m_isStringMode = false;
tkn = token::TokenQuotedText;
break;
} else if (iscntrl(chr) || !isprint(chr)) {
fs->putback(chr);
m_isStringMode = false;
break;
} else {
buf += chr;
}
} else if (m_isTextMode) {
if (isalnum(chr) || (chr == '_')) {
buf += chr;
} else {
fs->putback(chr);
m_isTextMode = false;
break;
}
} else if (m_isNumberMode) {
if (isdigit(chr)) {
buf += chr;
} else if (chr == '.') {
if (m_numberModeHasDecimal == false) {
m_numberModeHasDecimal = true;
tkn = token::TokenDecimal;
buf += chr;
} else {
fs->putback(chr);
m_isNumberMode = false;
break;
}
} else {
fs->putback(chr);
m_isNumberMode = false;
break;
}
} else if (m_isCommentMode) {
buf += chr;
tkn = token::TokenComment;
} else {
// Whitespace
if (isspace(chr))
continue;
// Control Code
if (iscntrl(chr)) {
tkn = token::TokenUnknown;
buf = chr;
}
// Special handling for + and -, due to numbers and decimals.
if (chr == '+' || chr == '-') {
char chr2 = fs->get();
if (isdigit(chr2)) {
m_isNumberMode = true;
m_numberModeHasDecimal = false;
tkn = token::TokenNumber;
buf = chr + chr2;
break;
} else if (chr2 == '.') {
m_isNumberMode = true;
m_numberModeHasDecimal = true;
tkn = token::TokenDecimal;
buf = chr + "0" + chr2;
break;
} else {
fs->putback(chr2);
}
}
// Symbol
for (auto v : g_symbolCharacters) {
if (v.first == chr) {
tkn = v.second;
buf = v.first;
break;
}
}
if (tkn != token::TokenEOF) {
haveResult = true;
break;
}
// Strings, Text, Numbers
if (chr == ';') {
m_isCommentMode = true;
tkn = token::TokenSemicolon;
buf = chr;
break;
} else if (chr == '\"') {
m_isStringMode = true;
tkn = token::TokenDoubleQuote;
buf = chr;
break;
} else if (isalpha(chr)) {
m_isTextMode = true;
tkn = token::TokenText;
buf = chr;
} else if (isdigit(chr)) {
m_isNumberMode = true;
m_numberModeHasDecimal = false;
tkn = token::TokenNumber;
buf = chr;
} else if (chr == '.') {
m_isNumberMode = true;
m_numberModeHasDecimal = true;
tkn = token::TokenDecimal;
buf = "0" + chr;
} else {
tkn = token::TokenUnknown;
buf = chr;
break;
}
}
}
// Convert from Text into native Token.
if (tkn == token::TokenText)
tkn = to_token(tkn, buf);
return std::make_pair(tkn, buf);
}
blitz::lexer::token blitz::lexer::to_token(token in, std::string text) {
static std::pair<const char*, token> l_textToTokenList[] = {
// Binary
{ "not", token::TokenNot },
{ "and", token::TokenAnd },
{ "or", token::TokenOr },
{ "xor", token::TokenXor },
{ "shl", token::TokenShl },
{ "shr", token::TokenShr },
{ "sal", token::TokenSal },
{ "sar", token::TokenSar },
{ "false", token::TokenFalse },
{ "true", token::TokenTrue },
// Conversion
{ "float", token::TokenFloat },
{ "string", token::TokenString },
{ "hex", token::TokenHex },
{ "int", token::TokenInt },
// Control
{ "if", token::TokenIf },
{ "then", token::TokenThen },
{ "elseif", token::TokenElseIf },
{ "else", token::TokenElse },
{ "endif", token::TokenEndIf },
{ "select", token::TokenSelect },
{ "case", token::TokenCase },
{ "default", token::TokenDefault },
{ "goto", token::TokenGoto },
{ "gosub", token::TokenGosub },
{ "return", token::TokenReturn },
{ "function", token::TokenFunction },
{ "end", token::TokenEnd },
{ "stop", token::TokenStop },
// Loop
{ "for", token::TokenFor },
{ "to", token::TokenTo },
{ "next", token::TokenNext },
{ "while", token::TokenWhile },
{ "wend", token::TokenWend },
{ "repeat", token::TokenRepeat },
{ "until", token::TokenUntil },
{ "forever", token::TokenForever },
{ "exit", token::TokenExit },
// Math
{ "abs", token::TokenAbs },
{ "sign", token::TokenSign },
{ "cos", token::TokenCos },
{ "sin", token::TokenSin },
{ "tan", token::TokenTan },
{ "acos", token::TokenACos },
{ "asin", token::TokenASin },
{ "atan", token::TokenATan },
{ "atan2", token::TokenATan2 },
{ "log", token::TokenLog },
{ "log10", token::TokenLog10 },
{ "ceil", token::TokenCeil },
{ "floor", token::TokenFloor },
{ "mod", token::TokenMod },
{ "pi", token::TokenPi },
{ "exp", token::TokenExp },
{ "sqr", token::TokenSqr },
// Variables
{ "const", token::TokenConst },
{ "global", token::TokenGlobal },
{ "local", token::TokenLocal },
// Includes
{ "include", token::TokenInclude },
};
for (auto v : l_textToTokenList) {
if (stricmp(text.c_str(), v.first)) {
return v.second;
}
}
return in;
}
*/