// BlitzLLVM/code_compiler/source/lexer.cpp

/// AUTOGENERATED COPYRIGHT HEADER START
// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks <info@xaymar.com>
// AUTOGENERATED COPYRIGHT HEADER END
#include "lexer.hpp"
#include <codecvt>
#include <cstdarg>
#include <sstream>
#include "util.hpp"

std::string blitz::token::to_string()
{
	std::string name;
	switch (type) {
	case variant::UNKNOWN:
		name = "Unknown";
		break;
	case variant::ENDOFFILE:
		name = "EndOfFile";
		break;
	case variant::NEWLINE:
		name = "NewLine";
		break;
	case variant::CONTROL:
		name = "Control";
		break;
	case variant::COMMENT:
		name = "Comment";
		break;
	case variant::TEXT:
		name = "Text";
		break;
	case variant::STRING:
		name = "String";
		break;
	case variant::INTEGER:
		name = "Integer";
		break;
	case variant::REAL:
		name = "Real";
		break;
	case variant::SYMBOL:
		name = "Symbol";
		break;
	default:
		name = "Invalid";
		break;
	}

	if (type == variant::NEWLINE || type == variant::CONTROL) {
		return blitz::format("%s(%llu@%llu, %d)", name.c_str(), location.first, location.second, text[0]);
	} else {
		return blitz::format("%s(%llu@%llu, %s)", name.c_str(), location.first, location.second, text.c_str());
	}
}

// A token compares equal to a variant when it is of that kind; the text is ignored.
bool blitz::token::operator==(variant rhs)
{
	const bool same_kind = (rhs == type);
	return same_kind;
}

// A token compares equal to a string when its text matches exactly; the kind is ignored.
bool blitz::token::operator==(std::string const& rhs)
{
	return text.compare(rhs) == 0;
}

// Nothing to release by hand; the members clean up after themselves.
blitz::lexer::~lexer() = default;

// Opens `file` for lexing.
//
// The read position starts at line 1, column 1, and both the current and the
// look-ahead token start out as empty placeholders of kind NONE.
//
// Throws std::runtime_error if the file could not be opened for reading.
blitz::lexer::lexer(std::filesystem::path file)
{
	// Files conventionally start at line 1, character 1, so counting begins there.
	_location = { 1, 1 };

	// Binary mode keeps the bytes untouched, so UTF-8 can eventually be supported.
	_file   = file;
	_stream = std::ifstream(_file, std::ios_base::binary);

	// good() is false as soon as any of the eof/fail/bad state bits is set.
	if (!_stream.good()) {
		throw std::runtime_error(blitz::format("Reading file '%s' failed.", file.generic_string().c_str()));
	}

	// Neither token has been lexed yet; mark both with the NONE placeholder.
	const blitz::token placeholder{
		.location = { 0, 0 },
		.text     = "",
		.type     = token::variant::NONE,
	};
	_current = placeholder;
	_next    = placeholder;
}

// Returns a copy of the token most recently made current by next().
blitz::token blitz::lexer::current()
{
	blitz::token snapshot = _current;
	return snapshot;
}

// Advances the lexer by one token and returns the new current token.
//
// The look-ahead token (lexed on demand by peek()) becomes the current token, and the
// look-ahead slot is reset to NONE so the following peek() lexes a fresh token.
blitz::token blitz::lexer::next()
{
	// Promote the look-ahead token.
	_current = peek();

	// Invalidate the look-ahead slot.
	const blitz::token consumed{
		.location = { 0, 0 },
		.text     = "",
		.type     = token::variant::NONE,
	};
	_next = consumed;

	return _current;
}

// Returns the look-ahead token without consuming it.
//
// If a look-ahead token is already cached (its kind is not NONE) it is returned as-is.
// Otherwise exactly one token is lexed from the stream, cached and returned. Lexing is a
// small state machine: the DEFAULT stage classifies the first character of a token, and
// the TEXT, NUMBER, STRING and COMMENT stages collect the remaining characters until a
// character that ends the token is seen (that character is left in the stream).
//
// The location stored in the returned token is the position of the token's first
// character; for String tokens that is the opening quote.
//
// Throws blitz::error when a character is found that cannot start or continue a token.
blitz::token blitz::lexer::peek()
{
	// A cached look-ahead token stays valid until next() resets it.
	if (_next.type != blitz::token::variant::NONE) {
		return _next;
	}

	// ToDo: Optimize
	enum class stage {
		DEFAULT,
		TEXT,
		NUMBER,
		STRING,
		COMMENT,
	} state = stage::DEFAULT;

	std::stringstream buffer;
	blitz::token      token{
		.location = _location,
		.text     = "",
		.type     = blitz::token::variant::UNKNOWN,
	};

	auto issymbol     = [](int chr) { return blitz::utility::is_symbol(chr); };
	auto iswhitespace = [](int chr) { return blitz::utility::is_white_space(chr); };

	// Builds a one-character string. Deliberately uses the (count, char) constructor:
	// brace forms such as `{ 1, char(chr) }` pick the initializer_list overload instead
	// and silently produce the two-character string "\x01<chr>".
	auto as_text = [](int chr) { return std::string(1, static_cast<char>(chr)); };

	// peek() on the stream sets the EOF flag once the end is reached. When a token was
	// terminated by the end of the file, the flag is therefore already set on the following
	// call, and the end is reported here instead of by the `chr == EOF` branch below.
	if (_stream.eof()) {
		token.location = _location;
		token.type     = blitz::token::variant::ENDOFFILE;
		return token;
	}

	bool complete = false;
	while (!complete && _stream.good()) {
		// Peek at the current byte, without advancing the read pointer until we need to.
		auto chr           = _stream.peek();
		bool is_newline    = (chr == '\r') || (chr == '\n');
		bool is_returnfeed = (chr == '\r');

		if (state == stage::DEFAULT) {
			if (chr == EOF) {
				token.type     = blitz::token::variant::ENDOFFILE;
				token.text     = "";
				token.location = _location;
				complete       = true;
				_location.second++;
			} else if (is_newline) {
				// New Line, should be handled like a control character, but with some special things.
				token.type     = blitz::token::variant::NEWLINE;
				token.text     = "\n";
				token.location = _location;
				complete       = true;

				// Advance the read pointer.
				_stream.get();

				// Is this a Windows-style \r\n?
				if (is_returnfeed && (_stream.peek() == '\n')) {
					// If so, advance the read pointer again.
					_stream.get();
				}

				// Then update the location.
				_location.first++;
				_location.second = 1;
			} else if (iswhitespace(chr)) {
				// This is white space, which we'll happily ignore.
				_stream.get();
				_location.second++;
			} else if (chr < 32) {
				// Likely to be a control character.
				token.location = _location;
				token.type     = blitz::token::variant::CONTROL;
				token.text     = as_text(chr);
				complete       = true;
				_stream.get();
				_location.second++;
				/*} else if (chr == ':') {
				// Allows code writers to pretend it's all one line.
				token.location = _location;
				token.type     = blitz::token::variant::SEPARATOR;
				token.text     = {1, char(chr)};
				complete       = true;
				_stream.get();
				_location.second++;*/
			} else if (chr == ';') {
				// A comment, which ends at the next new line.
				state          = stage::COMMENT;
				token.location = _location;
				token.type     = blitz::token::variant::COMMENT;
			} else if (isdigit(chr)) {
				// Starts out as an Integer; the NUMBER stage promotes it to a Real on a '.'.
				state          = stage::NUMBER;
				token.location = _location;
				token.type     = blitz::token::variant::INTEGER;
			} else if (isalpha(chr)) {
				// Text of some kind.
				state          = stage::TEXT;
				token.location = _location;
				token.type     = blitz::token::variant::TEXT;
			} else if (chr == '"') {
				// A quoted string.
				state          = stage::STRING;
				token.location = _location;
				token.type     = blitz::token::variant::STRING;

				// Advance so we actually get anywhere.
				_stream.get();
				_location.second++;
			} else if (issymbol(chr)) {
				// Special Handling for a few symbols that could mean multiple things.
				if (chr == '.') { // '.' can start a Real, Label or Structured Type Access. We don't want to decide on the latter here, that's a parser thing.
					// Record where the token starts before moving past the '.'.
					token.location = _location;
					buffer << static_cast<char>(chr);

					// We advance the read pointer here to look at what's coming next.
					_stream.get();
					chr = _stream.peek();
					_location.second++;

					// Peek at what's coming next.
					if (isdigit(chr)) {
						// This is a Real number; the '.' is already consumed and buffered.
						token.type = blitz::token::variant::REAL;
						state      = stage::NUMBER;
					} else {
						// Assume this is a symbol and return to normal behavior.
						token.text = buffer.str();
						token.type = blitz::token::variant::SYMBOL;
						complete   = true;
					}
				} else if ((chr == '+') || (chr == '-')) { // '+' & '-' could be prefixes to an Integer or Real.
					// Record where the token starts before moving past the sign.
					token.location = _location;
					buffer << static_cast<char>(chr);

					// Advance the read pointer to peek at the future.
					_stream.get();
					chr = _stream.peek();
					_location.second++;

					// Peek at what's coming up.
					if (isdigit(chr) || (chr == '.')) { // Likely to be a Real or Integer.
						// Always start as an Integer. A '.' following the sign has not been
						// consumed yet; the NUMBER stage consumes it and promotes the token to
						// a Real. Marking it Real here would make the NUMBER stage reject that
						// very '.' as a second decimal point.
						token.type = blitz::token::variant::INTEGER;
						state      = stage::NUMBER;
					} else {
						token.text = buffer.str();
						token.type = blitz::token::variant::SYMBOL;
						complete   = true;
					}
				} else {
					token.location = _location;
					token.text     = as_text(chr);
					token.type     = blitz::token::variant::SYMBOL;
					complete       = true;

					// Advance so we actually get anywhere.
					_stream.get();
					_location.second++;
				}
			} else {
				// Everything else is an error
				throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it.");
			}
		} else if (state == stage::NUMBER) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) {
				// EOF, Control, NL, Whitespace, and Comments should return to default parsing.
				complete = true;
			} else if (isdigit(chr) || (chr == '.') || (chr == 'b') || (chr == 'x')) {
				// Consume the character and keep the column in step with the read pointer.
				_stream.get();
				_location.second++;
				buffer << static_cast<char>(chr);
				if (chr == '.') {
					if (token.type != blitz::token::variant::REAL) {
						// First decimal point: the number is a Real from here on.
						token.type = blitz::token::variant::REAL;
					} else {
						// A second decimal point is never valid.
						token.text = buffer.str();
						throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [0-9], got '%s' instead.", token.to_string().c_str(), as_text(chr).c_str()));
					}
				}
			} else if (issymbol(chr)) {
				// Any other symbol ends the number and is lexed as its own token.
				complete = true;
			} else {
				token.text = buffer.str();
				throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected ([0](b|x|))[0-9.], got '%s' instead.", token.to_string().c_str(), as_text(chr).c_str()));
			}

			if (complete) {
				token.text = buffer.str();
			}
		} else if (state == stage::TEXT) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) {
				// Return to default parsing.
				complete = true;
			} else if (isalpha(chr) || isdigit(chr) || (chr == '_')) {
				buffer << static_cast<char>(chr);
				_stream.get();
				_location.second++;
			} else {
				token.text = buffer.str();
				throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [a-zA-Z0-9_], got '%s' instead.", token.to_string().c_str(), as_text(chr).c_str()));
			}

			if (complete) {
				token.text = buffer.str();
			}
		} else if (state == stage::STRING) {
			if ((chr == EOF) || (chr < 32) || is_newline) {
				// An unterminated string simply ends here; return to default parsing.
				complete = true;
			} else if (chr == '"') { // The only true way to end a string.
				complete = true;

				// Skip over the " so we don't confuse the parser.
				_stream.get();
				_location.second++;
			} else {
				buffer << static_cast<char>(chr);
				_stream.get();
				_location.second++;
			}

			if (complete) {
				token.text = buffer.str();
			}
		} else if (state == stage::COMMENT) {
			if ((chr == EOF) || (chr < 32) || is_newline) {
				// Return to default parsing at this point.
				complete = true;
			} else {
				buffer << static_cast<char>(chr);
				_stream.get();
				_location.second++;
			}

			if (complete) {
				token.text = buffer.str();
			}
		}
	}

	_next = token;
	return _next;
}

// Returns a copy of the path of the file this lexer reads from.
std::filesystem::path blitz::lexer::file()
{
	std::filesystem::path source_path(_file);
	return source_path;
}