Files

370 lines
11 KiB
C++
Raw Permalink Normal View History

2024-06-25 18:59:15 +02:00
/// AUTOGENERATED COPYRIGHT HEADER START
// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks <info@xaymar.com>
2024-06-25 18:59:15 +02:00
// AUTOGENERATED COPYRIGHT HEADER END
2017-11-13 02:14:57 +01:00
#include "lexer.hpp"
#include <codecvt>
2024-06-26 00:31:06 +02:00
#include <cstdarg>
2024-06-25 18:59:15 +02:00
#include <sstream>
#include "util.hpp"
2017-11-13 02:14:57 +01:00
2024-06-26 00:31:06 +02:00
std::string blitz::token::to_string()
{
std::string name;
switch (type) {
case variant::UNKNOWN:
name = "Unknown";
break;
case variant::ENDOFFILE:
name = "EndOfFile";
break;
case variant::NEWLINE:
name = "NewLine";
break;
case variant::CONTROL:
name = "Control";
break;
case variant::COMMENT:
name = "Comment";
break;
case variant::TEXT:
name = "Text";
break;
case variant::STRING:
name = "String";
break;
case variant::INTEGER:
name = "Integer";
break;
case variant::REAL:
name = "Real";
break;
case variant::SYMBOL:
name = "Symbol";
break;
default:
2025-01-25 16:27:50 +01:00
name = "Invalid";
2024-06-26 00:31:06 +02:00
break;
}
if (type == variant::NEWLINE || type == variant::CONTROL) {
2025-01-25 16:27:50 +01:00
return blitz::format("%s(%llu@%llu, %d)", name.c_str(), location.first, location.second, text[0]);
2024-06-26 00:31:06 +02:00
} else {
2025-01-25 16:27:50 +01:00
return blitz::format("%s(%llu@%llu, %s)", name.c_str(), location.first, location.second, text.c_str());
2024-06-26 00:31:06 +02:00
}
}
2024-06-25 18:59:15 +02:00
2025-01-25 16:27:50 +01:00
/// Compare the kind of this token against a token variant.
///
/// @param rhs The variant to compare with.
/// @return `true` if this token's `type` equals `rhs`.
bool blitz::token::operator==(variant rhs)
{
	bool const same_variant = (this->type == rhs);
	return same_variant;
}
/// Compare the text of this token against a string.
///
/// @param rhs The string to compare with.
/// @return `true` if this token's `text` is exactly `rhs`.
bool blitz::token::operator==(std::string const& rhs)
{
	return this->text.compare(rhs) == 0;
}
2024-06-25 18:59:15 +02:00
/// Nothing to release by hand; the members clean up after themselves.
blitz::lexer::~lexer() = default;
/// Create a lexer that reads tokens from a file.
///
/// @param file Path of the file to tokenize.
/// @throws std::runtime_error if the file stream is not in a good state after opening.
blitz::lexer::lexer(std::filesystem::path file)
{
	// Usually files start at line 1 and character 1, so we should start there too.
	_location = {1, 1};

	// Try and open the file for reading.
	_file   = file;
	_stream = std::ifstream(_file, std::ios_base::binary); // We use binary so we can eventually support UTF-8.
	// good() is only true when none of eofbit, failbit and badbit are set, so this single
	// check covers every failure state.
	if (!_stream.good()) {
		throw std::runtime_error(blitz::format("Reading file '%s' failed.", file.generic_string().c_str()));
	}

	// Initialize token storage to a default token. A type of NONE marks "no token here".
	_next = _current = blitz::token{
		.location = {0, 0},
		.text     = "",
		.type     = token::variant::NONE,
	};
}
2024-06-25 18:59:15 +02:00
/// Get the token most recently returned by next().
///
/// @return A copy of the stored current token.
blitz::token blitz::lexer::current()
{
	blitz::token snapshot = this->_current;
	return snapshot;
}
/// Advance to the next token and make it the current one.
///
/// The look-ahead token is taken from peek() and the look-ahead slot is then
/// reset to NONE, so the following peek() reads a fresh token from the stream.
///
/// @return A copy of the new current token.
blitz::token blitz::lexer::next()
{
	_current = peek();
	_next    = blitz::token{
		.location = {0, 0},
		.text     = "",
		.type     = token::variant::NONE,
	};
	return _current;
}
/// Look at the upcoming token without making it the current one.
///
/// The token is read from the stream on the first call and cached in `_next`;
/// further calls return the cached token until next() resets the cache.
/// Characters are only consumed from the stream once they are known to belong
/// to the token being built.
///
/// `token.location` is the position of the first character of the token.
///
/// @return A copy of the upcoming token.
/// @throws blitz::error if a character is encountered that cannot start a token
///         or cannot continue the number/text token being read.
blitz::token blitz::lexer::peek()
{
	// A cached look-ahead token is handed out again until next() clears it.
	if (_next.type != blitz::token::variant::NONE) {
		return _next;
	}

	// ToDo: Optimize
	enum class stage {
		DEFAULT,
		TEXT,
		NUMBER,
		STRING,
		COMMENT,
	} state = stage::DEFAULT;

	std::string  buffer; // Characters collected for the token so far.
	blitz::token token{
		.location = _location,
		.text     = "",
		.type     = blitz::token::variant::UNKNOWN,
	};

	auto issymbol     = [](int chr) { return blitz::utility::is_symbol(chr); };
	auto iswhitespace = [](int chr) { return blitz::utility::is_white_space(chr); };

	// Reports an unexpected character inside the token currently being built. The token
	// text is filled in first so that the message shows what has been read so far.
	auto fail = [&](char const* expected, int chr) {
		token.text = buffer;
		throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected %s, got '%s' instead.", token.to_string().c_str(), expected, std::string(1, static_cast<char>(chr)).c_str()));
	};

	// ToDo: Figure out why we don't ever hit chr == EOF.
	// The loop below only runs while the stream is good(), so an already exhausted
	// stream is reported here. This token is returned directly and is not cached.
	if (_stream.eof()) {
		token.location = _location;
		token.type     = blitz::token::variant::ENDOFFILE;
		return token;
	}

	bool complete = false;
	while (!complete && _stream.good()) {
		// Peek at the current byte, without advancing the read pointer until we need to.
		auto chr           = _stream.peek();
		bool is_newline    = (chr == '\r') || (chr == '\n');
		bool is_returnfeed = (chr == '\r');

		if (state == stage::DEFAULT) {
			if (chr == EOF) {
				token.type     = blitz::token::variant::ENDOFFILE;
				token.text     = "";
				token.location = _location;
				complete       = true;
				_location.second++;
			} else if (is_newline) {
				// New Line, should be handled like a control character, but with some special things.
				token.type     = blitz::token::variant::NEWLINE;
				token.text     = "\n";
				token.location = _location;
				complete       = true;

				// Advance the read pointer.
				_stream.get();

				// Is this a Windows-style \r\n?
				if (is_returnfeed && (_stream.peek() == '\n')) {
					// If so, advance the read pointer again.
					_stream.get();
				}

				// Then update the location.
				_location.first++;
				_location.second = 1;
			} else if (iswhitespace(chr)) {
				// This is white space, which we'll happily ignore.
				_stream.get();
				_location.second++;
			} else if (chr < 32) {
				// Likely to be a control character.
				token.location = _location;
				token.type     = blitz::token::variant::CONTROL;
				// (count, char) constructor: braces would pick the initializer_list overload
				// and store "\x01" followed by the character.
				token.text = std::string(1, static_cast<char>(chr));
				complete   = true;
				_stream.get();
				_location.second++;
				// (A dedicated SEPARATOR token for ':' is currently disabled; ':' falls through to the checks below.)
			} else if (chr == ';') {
				// A comment, which ends at the next new line. The ';' itself is collected by the COMMENT stage.
				state          = stage::COMMENT;
				token.location = _location;
				token.type     = blitz::token::variant::COMMENT;
			} else if (isdigit(chr)) {
				// Probably an Integer; the NUMBER stage turns it into a Real once it sees a '.'.
				state          = stage::NUMBER;
				token.location = _location;
				token.type     = blitz::token::variant::INTEGER;
			} else if (isalpha(chr)) {
				// Text of some kind.
				state          = stage::TEXT;
				token.location = _location;
				token.type     = blitz::token::variant::TEXT;
			} else if (chr == '"') {
				// A quoted string.
				state          = stage::STRING;
				token.location = _location;
				token.type     = blitz::token::variant::STRING;

				// Advance past the opening quote so we actually get anywhere.
				_stream.get();
				_location.second++;
			} else if (issymbol(chr)) {
				// Every symbol token starts at the current position; record it before advancing.
				token.location = _location;

				// Special Handling for a few symbols that could mean multiple things.
				if (chr == '.') { // '.' can start a Real, Label or Structured Type Access. We don't want to decide on the latter here, that's a parser thing.
					buffer += static_cast<char>(chr);

					// We advance the read pointer here to look at what's coming next.
					_stream.get();
					chr = _stream.peek();
					_location.second++;

					if (isdigit(chr)) {
						// This is a Real number.
						token.type = blitz::token::variant::REAL;
						state      = stage::NUMBER;
					} else {
						// Assume this is a symbol and return to normal behavior.
						token.text = buffer;
						token.type = blitz::token::variant::SYMBOL;
						complete   = true;
					}
				} else if ((chr == '+') || (chr == '-')) { // '+' & '-' could be prefixes to an Integer or Real.
					buffer += static_cast<char>(chr);

					// Advance the read pointer to peek at the future.
					_stream.get();
					chr = _stream.peek();
					_location.second++;

					if (isdigit(chr) || (chr == '.')) {
						// Start out as an Integer. The NUMBER stage promotes the token to a Real when
						// it consumes the '.'; marking it Real here would make that first '.' look
						// like a second decimal point and be rejected.
						token.type = blitz::token::variant::INTEGER;
						state      = stage::NUMBER;
					} else {
						token.text = buffer;
						token.type = blitz::token::variant::SYMBOL;
						complete   = true;
					}
				} else {
					token.text = std::string(1, static_cast<char>(chr));
					token.type = blitz::token::variant::SYMBOL;
					complete   = true;

					// Advance so we actually get anywhere.
					_stream.get();
					_location.second++;
				}
			} else {
				// Everything else is an error
				throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it.");
			}
		} else if (state == stage::NUMBER) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) {
				// EOF, Control, NL, Whitespace, and Comments should return to default parsing.
				complete = true;
			} else if (chr == 'f') {
				// 'f' suffix: consumed, not stored in the text, and forces a Real.
				_stream.get();
				_location.second++;
				token.type = blitz::token::variant::REAL;
				complete   = true;
			} else if (chr == 'u') {
				// 'u' suffix: consumed, kept in the text, and forces an Integer.
				_stream.get();
				_location.second++;
				buffer += static_cast<char>(chr);
				token.type = blitz::token::variant::INTEGER;
				complete   = true;
			} else if ((chr == 'b') || (chr == 'x')) {
				// Base marker, only accepted within the first two characters of the number.
				_stream.get();
				buffer += static_cast<char>(chr);
				if (buffer.size() > 2) {
					fail("[0-9]", chr);
				}
				_location.second++;
			} else if (isdigit(chr) || (chr == '.')) {
				_stream.get();
				buffer += static_cast<char>(chr);
				if (chr == '.') {
					// Only a single decimal point is allowed.
					if (token.type == token::variant::REAL) {
						fail("[0-9]", chr);
					}
					token.type = blitz::token::variant::REAL;
				}
				_location.second++;
			} else if (issymbol(chr)) {
				// Any other symbol ends the number and is lexed on its own.
				complete = true;
			} else {
				fail("([0](b|x|))[0-9.]", chr);
			}

			if (complete) {
				token.text = buffer;
			}
		} else if (state == stage::TEXT) {
			if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) {
				// Return to default parsing.
				complete = true;
			} else if (isalpha(chr) || isdigit(chr) || (chr == '_')) {
				buffer += static_cast<char>(chr);
				_stream.get();
				_location.second++;
			} else {
				fail("[a-zA-Z0-9_]", chr);
			}

			if (complete) {
				token.text = buffer;
			}
		} else if (state == stage::STRING) {
			if ((chr == EOF) || (chr < 32) || is_newline) {
				// Unterminated string: return to default parsing with what we have.
				complete = true;
			} else if (chr == '"') { // The only true way to end a string.
				complete = true;

				// Skip over the " so we don't confuse the parser.
				_stream.get();
				_location.second++;
			} else {
				buffer += static_cast<char>(chr);
				_stream.get();
				_location.second++;
			}

			if (complete) {
				token.text = buffer;
			}
		} else if (state == stage::COMMENT) {
			if ((chr == EOF) || (chr < 32) || is_newline) {
				// Return to default parsing at this point.
				complete = true;
			} else {
				buffer += static_cast<char>(chr);
				_stream.get();
				_location.second++;
			}

			if (complete) {
				token.text = buffer;
			}
		}
	}

	_next = token;
	return _next;
}
/// Get the path of the file this lexer reads from.
///
/// @return A path constructed from the stored `_file`.
std::filesystem::path blitz::lexer::file()
{
	std::filesystem::path result(_file);
	return result;
}