// BlitzLLVM/code_compiler/source/lexer.cpp

/// AUTOGENERATED COPYRIGHT HEADER START
// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks <info@xaymar.com>
// AUTOGENERATED COPYRIGHT HEADER END
#include "lexer.hpp"
#include <codecvt>
#include <cstdarg>
#include <sstream>

/// printf-style formatting that returns the result as a std::string.
///
/// @param format printf format string; the variadic arguments must match it.
/// @return The formatted text, or an empty string if vsnprintf reported an
///         error (it returns a negative value, e.g. on an encoding error).
std::string format(const char* format, ...)
{
	va_list measure_args;
	va_list write_args;
	va_start(measure_args, format);
	va_copy(write_args, measure_args);

	// First pass: only measure. The result excludes the terminating NUL.
	const int length = vsnprintf(nullptr, 0, format, measure_args);
	va_end(measure_args);

	// A negative length signals failure; using it to size the output would
	// produce an invalid range, so bail out with an empty result instead.
	if (length < 0) {
		va_end(write_args);
		return {};
	}

	// Second pass: format straight into the string. Writing the terminating
	// NUL into the slot at data()[size()] is permitted.
	std::string result(static_cast<std::size_t>(length), '\0');
	const int   written = vsnprintf(result.data(), result.size() + 1, format, write_args);
	va_end(write_args);
	if (written < 0) {
		return {};
	}

	return result;
}

/// Builds a human-readable description of this token for diagnostics.
///
/// The result has the form `Name(<location.first>@<location.second>, <payload>)`.
/// For NEWLINE and CONTROL tokens the payload is the numeric value of the
/// first character of the text; for every other token it is the text itself.
///
/// @return The formatted description.
std::string blitz::token::to_string()
{
	const char* name = nullptr;
	switch (type) {
	case variant::UNKNOWN:
		name = "Unknown";
		break;
	case variant::ENDOFFILE:
		name = "EndOfFile";
		break;
	case variant::NEWLINE:
		name = "NewLine";
		break;
	case variant::CONTROL:
		name = "Control";
		break;
	case variant::SEPARATOR:
		// Produced by the lexer for ':'; previously fell through to the fallback name.
		name = "Separator";
		break;
	case variant::COMMENT:
		name = "Comment";
		break;
	case variant::TEXT:
		name = "Text";
		break;
	case variant::STRING:
		name = "String";
		break;
	case variant::INTEGER:
		name = "Integer";
		break;
	case variant::REAL:
		name = "Real";
		break;
	case variant::SYMBOL:
		name = "Symbol";
		break;
	default:
		name = "How the fuck?!";
		break;
	}

	// Cast explicitly so the arguments always match the %llu / %d conversions,
	// whatever integer type the location members are declared with.
	const auto first  = static_cast<unsigned long long>(location.first);
	const auto second = static_cast<unsigned long long>(location.second);

	if (type == variant::NEWLINE || type == variant::CONTROL) {
		// These carry non-printable characters, so show the character code instead.
		return format("%s(%llu@%llu, %d)", name, first, second, static_cast<int>(text[0]));
	}
	return format("%s(%llu@%llu, %s)", name, first, second, text.c_str());
}

/// Nothing to release by hand; the members clean up after themselves.
blitz::lexer::~lexer() = default;

/// Opens `file` for lexing and positions the lexer at the start of the input.
///
/// @param file Path of the source file to read.
/// @throws std::runtime_error if the stream is not in a good state after opening.
blitz::lexer::lexer(std::filesystem::path file)
{
	// Remember the path and open it. Binary mode is used so that UTF-8
	// support can be added later on.
	_file = file;
	_stream.open(_file, std::ios_base::binary);

	// good() is only true when none of eofbit, failbit or badbit is set.
	if (!_stream.good()) {
		throw std::runtime_error(format("Reading file '%s' failed.", file.generic_string().c_str()));
	}

	// Files conventionally start at line 1, character 1.
	_location = {1, 1};

	// Both token slots start out as an empty UNKNOWN token.
	const blitz::token blank{
		.location = {0, 0},
		.text     = "",
		.type     = token::variant::UNKNOWN,
	};
	_current  = blank;
	_override = blank;
}

/// Returns a copy of the stored current token: the one most recently stored
/// by next(), or the empty UNKNOWN token set up by the constructor.
blitz::token blitz::lexer::current()
{
	const blitz::token& latest = _current;
	return latest;
}

/// Reads the next token from the input stream.
///
/// Spaces and tabs between tokens are skipped. The token that is returned is
/// also stored so that current() hands out the same token afterwards. Once the
/// end of the stream has been reached, every call returns an ENDOFFILE token.
///
/// @return The token that was read.
/// @throws blitz::error if a character is found that is not valid at the
///         current position (for example a second '.' inside a number).
blitz::token blitz::lexer::next()
{
	using variant = blitz::token::variant;

	enum class stage {
		DEFAULT, // Between tokens: decide which kind of token starts here.
		TEXT,    // Inside a word: a letter followed by letters, digits or '_'.
		NUMBER,  // Inside an Integer or Real literal.
		STRING,  // Inside a quoted string; the opening '"' is already consumed.
		COMMENT, // Inside a ';' comment, which runs until the end of the line.
	} state = stage::DEFAULT;

	std::string  lexeme; // Characters collected for the token being built.
	blitz::token result{
		.location = _location,
		.text     = "",
		.type     = variant::UNKNOWN,
	};

	auto issymbol = [](int chr) {
		switch (chr) {
		case ';': // Comment
		case ':': // Command Separator
		case '=': // Equal
		case '<': // Less Than
		case '>': // Greater Than
		case '~': // Bitwise Not
		case '^': // Exponential (X ^ Y = pow(X, Y))
		case '+': // Plus
		case '-': // Minus
		case '*': // Multiply
		case '/': // Divide
		case ',': // Parameter Separation
		case '%': // Integer Type
		case '#': // Real Type
		case '$': // String Type
		case '.': // Structured Type
		case '\\': // Structured Type Access
			// Blitz Arrays
		case '[':
		case ']':
			// Call, Grouping, Dim
		case '(':
		case ')':
			return true;
		default:
			return false;
		}
	};
	auto iswhitespace = [](int chr) { return (chr == ' ') || (chr == '\t'); };

	// Builds a one-character string. Brace forms such as `text = {1, c}` or
	// `std::string{1, c}` must not be used for this: they select the
	// initializer_list overload and yield the TWO characters '\x01' and c.
	auto as_text = [](int chr) { return std::string(1, static_cast<char>(chr)); };

	// Consumes the byte that was just peeked and moves one column to the right.
	auto advance = [this]() {
		_stream.get();
		_location.second++;
	};

	// peek() sets eofbit when it runs into the end of the stream. If the previous
	// token (text, number, string or comment) was terminated by the end of the
	// stream, the stream is therefore no longer good() here and the loop below
	// would not run at all, so the end of file has to be reported up front.
	if (_stream.eof()) {
		result.type = variant::ENDOFFILE;
		_current    = result; // Keep current() in sync with what we return.
		return _current;
	}

	bool complete = false;
	while (!complete && _stream.good()) {
		// Peek at the current byte, without advancing the read pointer until we need to.
		const auto chr           = _stream.peek();
		const bool is_newline    = (chr == '\r') || (chr == '\n');
		const bool is_returnfeed = (chr == '\r');

		if (state == stage::DEFAULT) {
			if (chr == EOF) {
				// The location is deliberately left alone so that repeated
				// ENDOFFILE tokens all report the same position.
				result.type     = variant::ENDOFFILE;
				result.text     = "";
				result.location = _location;
				complete        = true;
			} else if (is_newline) {
				// New Line, handled like a control character, but it also moves to the next line.
				result.type     = variant::NEWLINE;
				result.text     = "\n";
				result.location = _location;
				complete        = true;

				_stream.get();

				// Treat a Windows-style \r\n as a single line break.
				if (is_returnfeed && (_stream.peek() == '\n')) {
					_stream.get();
				}

				_location.first++;
				_location.second = 1;
			} else if (iswhitespace(chr)) {
				// White space between tokens is ignored.
				advance();
			} else if (chr < 32) {
				// Likely to be a control character.
				result.location = _location;
				result.type     = variant::CONTROL;
				result.text     = as_text(chr);
				complete        = true;
				advance();
			} else if (chr == ':') {
				// Allows code writers to pretend it's all one line.
				result.location = _location;
				result.type     = variant::SEPARATOR;
				result.text     = as_text(chr);
				complete        = true;
				advance();
			} else if (chr == ';') {
				// A comment, which ends at the next new line. The ';' itself is
				// not consumed here, so it becomes part of the comment text.
				state           = stage::COMMENT;
				result.location = _location;
				result.type     = variant::COMMENT;
			} else if (isdigit(chr)) {
				// Starts as an Integer; NUMBER turns it into a Real if a '.' follows.
				state           = stage::NUMBER;
				result.location = _location;
				result.type     = variant::INTEGER;
			} else if (isalpha(chr)) {
				// Text of some kind.
				state           = stage::TEXT;
				result.location = _location;
				result.type     = variant::TEXT;
			} else if (chr == '"') {
				// A quoted string; the opening quote is not part of the text.
				state           = stage::STRING;
				result.location = _location;
				result.type     = variant::STRING;
				advance();
			} else if (issymbol(chr)) {
				// Every symbol token starts right here, so record the position
				// BEFORE consuming anything.
				result.location = _location;

				if (chr == '.') {
					// '.' can start a Real, Label or Structured Type Access. Only the
					// Real is decided here, the rest is up to the parser.
					lexeme.push_back('.');
					advance();

					if (isdigit(_stream.peek())) {
						result.type = variant::REAL;
						state       = stage::NUMBER;
					} else {
						result.text = lexeme;
						result.type = variant::SYMBOL;
						complete    = true;
					}
				} else if ((chr == '+') || (chr == '-')) {
					// '+' & '-' could be prefixes to an Integer or Real.
					lexeme.push_back(static_cast<char>(chr));
					advance();

					const auto upcoming = _stream.peek();
					if (isdigit(upcoming) || (upcoming == '.')) {
						// Always start as an Integer: NUMBER promotes the token to a Real
						// when it consumes the '.'. Marking it as Real up front would make
						// NUMBER reject that very '.' as a second decimal point ("-.5").
						// A sign followed by a lone '.' is not rejected here.
						result.type = variant::INTEGER;
						state       = stage::NUMBER;
					} else {
						result.text = lexeme;
						result.type = variant::SYMBOL;
						complete    = true;
					}
				} else {
					result.text = as_text(chr);
					result.type = variant::SYMBOL;
					complete    = true;
					advance();
				}
			} else {
				// Everything else is an error
				throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it.");
			}
		} else if (state == stage::NUMBER) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) {
				// EOF, Control, NL, Whitespace, and Comments should return to default parsing.
				complete = true;
			} else if (isdigit(chr) || (chr == '.')) {
				const auto position = _location; // Where chr sits, for the error report below.
				lexeme.push_back(static_cast<char>(chr));
				advance(); // Keeps the column in step with the consumed characters.

				if (chr == '.') {
					if (result.type != variant::REAL) {
						result.type = variant::REAL;
					} else {
						// A second decimal point.
						result.text = lexeme;
						throw blitz::error(_file, result.location, position, format("In token %s: Expected [0-9], got '%s' instead.", result.to_string().c_str(), as_text(chr).c_str()));
					}
				}
			} else if (issymbol(chr)) {
				complete = true;
			} else {
				result.text = lexeme;
				throw blitz::error(_file, result.location, _location, format("In token %s: Expected [0-9.], got '%s' instead.", result.to_string().c_str(), as_text(chr).c_str()));
			}
		} else if (state == stage::TEXT) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) {
				// Return to default parsing.
				complete = true;
			} else if (isalpha(chr) || isdigit(chr) || (chr == '_')) {
				lexeme.push_back(static_cast<char>(chr));
				advance();
			} else {
				result.text = lexeme;
				throw blitz::error(_file, result.location, _location, format("In token %s: Expected [a-zA-Z0-9_], got '%s' instead.", result.to_string().c_str(), as_text(chr).c_str()));
			}
		} else if (state == stage::STRING) {
			// A tab is below 32 as well, but it is ordinary content inside a string.
			if ((chr == EOF) || ((chr < 32) && (chr != '\t')) || is_newline) {
				// Unterminated string: hand back what was collected so far.
				complete = true;
			} else if (chr == '"') { // The only true way to end a string.
				complete = true;

				// Skip over the " so we don't confuse the parser.
				advance();
			} else {
				lexeme.push_back(static_cast<char>(chr));
				advance();
			}
		} else if (state == stage::COMMENT) {
			// A tab must not end the comment, otherwise the remainder of the
			// comment would be lexed as code.
			if ((chr == EOF) || ((chr < 32) && (chr != '\t')) || is_newline) {
				// Return to default parsing at this point.
				complete = true;
			} else {
				lexeme.push_back(static_cast<char>(chr));
				advance();
			}
		}

		// Tokens that are collected character by character get their text once they end.
		if (complete && (state != stage::DEFAULT)) {
			result.text = lexeme;
		}
	}

	_current = result;
	return _current;
}

/*
std::pair<blitz::tokentype, std::string> blitz::lexer::current() {
	return _current;
}

std::pair<blitz::tokentype, std::string> blitz::lexer::next(std::istream& fs) {
	std::stringstream buffer;
	blitz::tokentype token;

	enum class parserState {
		DEFAULT,
		TEXT,
		NUMBER,
		STRING,
		COMMENT,
	} state = parserState::DEFAULT;

	while ((token == blitz::tokentype::TokenUnknown) && !fs.eof() && fs.good()) {
		auto chr = fs.get();


	}

}


/*
std::pair<blitz::lexer::token, std::string> blitz::lexer::next(std::shared_ptr<std::istream> fs) {
	std::string buf;
	token tkn = token::TokenUnknown;
	bool haveResult = false;

	// Allow "overriding" the next retrieved Token.
	if (m_overrideToken != token::TokenUnknown) {
		buf = m_overrideText;
		tkn = m_overrideToken;
		m_overrideToken = token::TokenUnknown;
		haveResult = true;
	}

	bool m_isTextMode = false;
	bool m_isNumberMode = false;
	bool m_isStringMode = false;
	bool m_isCommentMode = false;
	bool m_numberModeHasDecimal = false;
	while (((fs->eof() == false) && (fs->good())) && !haveResult) {
		char chr = fs->get();

		if (chr == '\r' || chr == '\n') {
			if (tkn != token::TokenEOF) {
				m_overrideToken = token::TokenNewLine;
				m_overrideText = "";
			} else {
				tkn = token::TokenNewLine;
				buf = "";
			}

			m_isStringMode = false;
			m_isNumberMode = false;
			m_isTextMode = false;
			m_isCommentMode = false;
			break;
		} else if (m_isStringMode) {
			if (chr == '\"') {
				m_overrideToken = token::TokenDoubleQuote;
				m_overrideText = chr;
				m_isStringMode = false;
				tkn = token::TokenQuotedText;
				break;
			} else if (iscntrl(chr) || !isprint(chr)) {
				fs->putback(chr);
				m_isStringMode = false;
				break;
			} else {
				buf += chr;
			}
		} else if (m_isTextMode) {
			if (isalnum(chr) || (chr == '_')) {
				buf += chr;
			} else {
				fs->putback(chr);
				m_isTextMode = false;
				break;
			}
		} else if (m_isNumberMode) {
			if (isdigit(chr)) {
				buf += chr;
			} else if (chr == '.') {
				if (m_numberModeHasDecimal == false) {
					m_numberModeHasDecimal = true;
					tkn = token::TokenDecimal;
					buf += chr;
				} else {
					fs->putback(chr);
					m_isNumberMode = false;
					break;
				}
			} else {
				fs->putback(chr);
				m_isNumberMode = false;
				break;
			}
		} else if (m_isCommentMode) {
			buf += chr;
			tkn = token::TokenComment;
		} else {
			// Whitespace
			if (isspace(chr))
				continue;

			// Control Code
			if (iscntrl(chr)) {
				tkn = token::TokenUnknown;
				buf = chr;
			}

			// Special handling for + and -, due to numbers and decimals.
			if (chr == '+' || chr == '-') {
				char chr2 = fs->get();
				if (isdigit(chr2)) {
					m_isNumberMode = true;
					m_numberModeHasDecimal = false;
					tkn = token::TokenNumber;
					buf = chr + chr2;
					break;
				} else if (chr2 == '.') {
					m_isNumberMode = true;
					m_numberModeHasDecimal = true;
					tkn = token::TokenDecimal;
					buf = chr + "0" + chr2;
					break;
				} else {
					fs->putback(chr2);
				}
			}

			// Symbol
			for (auto v : g_symbolCharacters) {
				if (v.first == chr) {
					tkn = v.second;
					buf = v.first;
					break;
				}
			}
			if (tkn != token::TokenEOF) {
				haveResult = true;
				break;
			}

			// Strings, Text, Numbers
			if (chr == ';') {
				m_isCommentMode = true;
				tkn = token::TokenSemicolon;
				buf = chr;
				break;
			} else if (chr == '\"') {
				m_isStringMode = true;
				tkn = token::TokenDoubleQuote;
				buf = chr;
				break;
			} else if (isalpha(chr)) {
				m_isTextMode = true;
				tkn = token::TokenText;
				buf = chr;
			} else if (isdigit(chr)) {
				m_isNumberMode = true;
				m_numberModeHasDecimal = false;
				tkn = token::TokenNumber;
				buf = chr;
			} else if (chr == '.') {
				m_isNumberMode = true;
				m_numberModeHasDecimal = true;
				tkn = token::TokenDecimal;
				buf = "0" + chr;
			} else {
				tkn = token::TokenUnknown;
				buf = chr;
				break;
			}
		}
	}

	// Convert from Text into native Token.
	if (tkn == token::TokenText)
		tkn = to_token(tkn, buf);

	return std::make_pair(tkn, buf);
}

blitz::lexer::token blitz::lexer::to_token(token in, std::string text) {
	static std::pair<const char*, token> l_textToTokenList[] = {
		// Binary
		{ "not", token::TokenNot },
		{ "and", token::TokenAnd },
		{ "or", token::TokenOr },
		{ "xor", token::TokenXor },
		{ "shl", token::TokenShl },
		{ "shr", token::TokenShr },
		{ "sal", token::TokenSal },
		{ "sar", token::TokenSar },
		{ "false", token::TokenFalse },
		{ "true", token::TokenTrue },

		// Conversion
		{ "float", token::TokenFloat },
		{ "string", token::TokenString },
		{ "hex", token::TokenHex },
		{ "int", token::TokenInt },

		// Control
		{ "if", token::TokenIf },
		{ "then", token::TokenThen },
		{ "elseif", token::TokenElseIf },
		{ "else", token::TokenElse },
		{ "endif", token::TokenEndIf },
		{ "select", token::TokenSelect },
		{ "case", token::TokenCase },
		{ "default", token::TokenDefault },
		{ "goto", token::TokenGoto },
		{ "gosub", token::TokenGosub },
		{ "return", token::TokenReturn },
		{ "function", token::TokenFunction },
		{ "end", token::TokenEnd },
		{ "stop", token::TokenStop },

		// Loop
		{ "for", token::TokenFor },
		{ "to", token::TokenTo },
		{ "next", token::TokenNext },
		{ "while", token::TokenWhile },
		{ "wend", token::TokenWend },
		{ "repeat", token::TokenRepeat },
		{ "until", token::TokenUntil },
		{ "forever", token::TokenForever },
		{ "exit", token::TokenExit },

		// Math
		{ "abs", token::TokenAbs },
		{ "sign", token::TokenSign },
		{ "cos", token::TokenCos },
		{ "sin", token::TokenSin },
		{ "tan", token::TokenTan },
		{ "acos", token::TokenACos },
		{ "asin", token::TokenASin },
		{ "atan", token::TokenATan },
		{ "atan2", token::TokenATan2 },
		{ "log", token::TokenLog },
		{ "log10", token::TokenLog10 },
		{ "ceil", token::TokenCeil },
		{ "floor", token::TokenFloor },
		{ "mod", token::TokenMod },
		{ "pi", token::TokenPi },
		{ "exp", token::TokenExp },
		{ "sqr", token::TokenSqr },

		// Variables
		{ "const", token::TokenConst },
		{ "global", token::TokenGlobal },
		{ "local", token::TokenLocal },

		// Includes
		{ "include", token::TokenInclude },
	};
	for (auto v : l_textToTokenList) {
		if (stricmp(text.c_str(), v.first)) {
			return v.second;
		}
	}
	return in;
}
*/