From b61005bcaa7c934b4f54f70fa184dc4b294e6930 Mon Sep 17 00:00:00 2001 From: Michael Fabian 'Xaymar' Dirks Date: Sat, 25 Jan 2025 19:25:14 +0100 Subject: [PATCH] More work on getting parsing to be functional --- .clang-format | 25 +- code_compiler/CMakeLists.txt | 6 +- code_compiler/source/ast/ast.cpp | 222 +++++++++++++- code_compiler/source/ast/ast.hpp | 51 ++-- code_compiler/source/error.cpp | 2 +- code_compiler/source/error.hpp | 2 +- code_compiler/source/lexer.cpp | 478 +++++++++++++++---------------- code_compiler/source/lexer.hpp | 8 +- code_compiler/source/main.cpp | 24 +- code_compiler/source/parser.cpp | 237 +++++++++------ code_compiler/source/parser.hpp | 2 +- code_compiler/source/types.cpp | 70 +++++ code_compiler/source/types.hpp | 52 ++++ code_compiler/source/util.cpp | 59 ++++ code_compiler/source/util.hpp | 13 + tests/test.bb | 4 + 16 files changed, 855 insertions(+), 400 deletions(-) create mode 100644 code_compiler/source/types.cpp create mode 100644 code_compiler/source/types.hpp create mode 100644 code_compiler/source/util.cpp create mode 100644 code_compiler/source/util.hpp create mode 100644 tests/test.bb diff --git a/.clang-format b/.clang-format index 00cafef..0d754a0 100644 --- a/.clang-format +++ b/.clang-format @@ -1,16 +1,17 @@ # AUTOGENERATED COPYRIGHT HEADER START -# Copyright (C) 2024 Michael Fabian 'Xaymar' Dirks +# Copyright (C) 2024-2025 Michael Fabian 'Xaymar' Dirks # AUTOGENERATED COPYRIGHT HEADER END # Basic Formatting TabWidth: 4 UseTab: ForContinuationAndIndentation ColumnLimit: 65535 +LineEnding: LF #- 0 does not respect the original line breaks! # Language Language: Cpp -Standard: c++17 +Standard: c++20 # Indentation AccessModifierOffset: 0 @@ -27,18 +28,10 @@ NamespaceIndentation: All IncludeCategories: - Regex: '^"warning-disable.hpp"$' Priority: 50 - - Regex: '^(<|")(config.hpp|common.hpp|ui-common.hpp|strings.hpp|version.hpp|obs.h)("|>)' - Priority: 100 - - Regex: '^ +# Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks # AUTOGENERATED COPYRIGHT HEADER END project(compiler VERSION ${PROJECT_VERSION} @@ -15,6 +15,10 @@ target_sources(${PROJECT_NAME} PRIVATE "source/error.cpp" "source/parser.hpp" "source/parser.cpp" + "source/util.hpp" + "source/util.cpp" + "source/types.hpp" + "source/types.cpp" "source/compiler.hpp" "source/compiler.cpp" "source/ast/ast.hpp" diff --git a/code_compiler/source/ast/ast.cpp b/code_compiler/source/ast/ast.cpp index b3fdbde..5cb45e9 100644 --- a/code_compiler/source/ast/ast.cpp +++ b/code_compiler/source/ast/ast.cpp @@ -1,14 +1,222 @@ // AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #include "ast.hpp" +#include +#include #include -blitz::ast::variable::~variable() {} - -blitz::ast::variable::variable(blitz::token token) : _token(token), _value(nullptr) {} - -void blitz::ast::variable::set_value(std::shared_ptr value) +blitz::ast::variable::~variable() { - _value = value; + /* Variable Parsing + * + * Declaration: + * - 8bit Signed Integer Variable + * Variable:Byte + * Variable:Int8 + * - 8bit Unsigned Integer Variable + * Variable:UByte + * Variable:UInt8 + * - 16bit Signed Integer Variable + * Variable:Short + * Variable:Int16 + * - 16bit Unsigned Integer Variable + * Variable:UShort + * Variable:UInt16 + * - 32bit Signed Integer Variable + * Variable + * Variable% + * Variable:Int + * Variable:Int32 + * - 32bit Unsigned Integer Variable + * Variable:UInt + * Variable:UInt32 + * - 64bit Signed Integer Variable + * Variable%% + * Variable:Long + * Variable:Int64 + * - 64bit Unsigned Integer Variable + * Variable:ULong + * Variable:UInt64 + * - 32bit Real Variable + * Variable# + * Variable:Float + * Variable:Float32 + * Variable:Real + * Variable:Real32 + * - 64bit Real Variable + * Variable## + * Variable:Double + * Variable:Float64 + * Variable:Real64 + * - UTF-8 String Variable + * Variable$ + * Variable:String + * - Struct Variable + * Variable.StructName + * Variable:StructName + * + * Access: + * - Struct Access: + * Variable\Key + * - Array Access: + * Variable[IntegerIndex] + * - Dynamic Array Access: + * Variable(IntegerIndex) + * - Direct Access: + * Variable + */ +} + +bool blitz::ast::variable::can_parse(std::shared_ptr lexer) +{ + return lexer->current().type == blitz::token::variant::TEXT; +} + +std::shared_ptr blitz::ast::variable::try_parse(std::shared_ptr lexer) +{ + auto file = lexer->file(); + + auto name_tk = lexer->current(); + if (name_tk.type != blitz::token::variant::TEXT) { + throw blitz::error(file, name_tk.location, name_tk.location, blitz::format("Unexpected %s, expected text.", name_tk.to_string().c_str())); + } + + auto node = std::make_shared(); + node->tokens.push_back(name_tk); + node->type = blitz::types::type::UNKNOWN; + node->name = name_tk.text; + + // Check if this has a type definition + auto symbol_tk = lexer->peek(); + if (symbol_tk.type != blitz::token::variant::SYMBOL) { + return node; + } + if (symbol_tk.text == ":") { + // :Type + node->tokens.push_back(lexer->next()); // Advance to next token. + auto type_tk = lexer->next(); + if (type_tk != blitz::token::variant::TEXT) { + throw blitz::error(file, name_tk.location, type_tk.location, blitz::format("Unexpected %s, expected text.", type_tk.to_string().c_str())); + } + + auto type = blitz::types::from_string(type_tk.text); + if (type == blitz::types::type::UNKNOWN) { + throw blitz::error(file, name_tk.location, type_tk.location, blitz::format("Unexpected %s, expected built-in type name.", type_tk.text.c_str())); + } + node->tokens.push_back(type_tk); + + node->type = type; + } else if (symbol_tk.text == ".") { + // .Struct + node->tokens.push_back(lexer->next()); // Advance to next token. + auto type_tk = lexer->next(); + if (type_tk != blitz::token::variant::TEXT) { + throw blitz::error(file, name_tk.location, type_tk.location, blitz::format("Unexpected %s, expected text.", type_tk.to_string().c_str())); + } + node->tokens.push_back(type_tk); + + node->type = blitz::types::type::STRUCT; + node->struct_name = type_tk.text; + } else if (symbol_tk.text == "%") { + // Int32 + node->tokens.push_back(lexer->next()); // Advance to next token. + node->type = blitz::types::type::INT32; + } else if (symbol_tk.text == "#") { + // Float + node->tokens.push_back(lexer->next()); // Advance to next token. + node->type = blitz::types::type::FLOAT32; + } else if (symbol_tk.text == "$") { + // String + node->tokens.push_back(lexer->next()); // Advance to next token. + node->type = blitz::types::type::STRING; + } + + return node; +} + +blitz::ast::value::~value() {} + +bool blitz::ast::value::can_parse(std::shared_ptr lexer) +{ + auto tk = lexer->current(); + switch (tk.type) { + case blitz::token::variant::STRING: + case blitz::token::variant::REAL: + case blitz::token::variant::INTEGER: + return true; + case blitz::token::variant::STRING: { + // We can only parse True, False, Null + std::string text = tk.text; + std::transform(text.cbegin(), text.cend(), text.begin(), [](char from) { + if (from & 0b10000000) { // Exclude Unicode + return from; + } + return (char)std::tolower(from); + }); + if (tk.text == "false") { + return true; + } else if (tk.text == "true") { + return true; + } else if (tk.text == "null") { + return true; + } + break; + } + } + + return false; +} + +std::shared_ptr blitz::ast::value::try_parse(std::shared_ptr lexer) +{ + auto tk = lexer->current(); + auto utk = lexer->peek(); + + auto node = std::make_shared(); + node->type = variant::UNKNOWN; + + if (tk.type == blitz::token::variant::STRING) { + node->type = variant::STRING; + node->text = tk.text; + return node; + } else if (tk.type == blitz::token::variant::INTEGER) { + // Figure out which base this integer is in (and where it starts). + int base = 10; + const char* text = tk.text.c_str(); + if ((tk.text.length() > 1) && (text[0] == '0')) { + if (text[1] == 'x') { // Base 16 + base = 16; + text = text += 2; + } else if (text[1] == 'b') { // Base 2 + base = 2; + text = text += 2; + } else if (text[1] == '0') { + base = 8; + text = text += 1; + } + } + + if (utk.type == blitz::token::variant::TEXT && utk.text == "u") { + // User specific this is unsigned, so treat it as such. + node->type = variant::UNSIGNED_INTEGER; + node->number.ui = strtoull(text, nullptr, base); + if (errno == ERANGE) { + throw blitz::error(file, tk.location, tk.location, blitz::format("Value '%s' is not representable on this system.", tk.text.c_str())); + } + } else { + // Try and figure out if it is unsigned. + node->number.i = strtoll(text, nullptr, base); + if (errno == ERANGE) { + node->type = variant::UNSIGNED_INTEGER; + node->number.ui = strtoull(text, nullptr, base); + if (errno == ERANGE) { + throw blitz::error(file, tk.location, tk.location, blitz::format("Value '%s' is not representable on this system.", tk.text.c_str())); + } + } else { + node->type = variant::INTEGER; + } + } + } + } diff --git a/code_compiler/source/ast/ast.hpp b/code_compiler/source/ast/ast.hpp index ed68756..796f362 100644 --- a/code_compiler/source/ast/ast.hpp +++ b/code_compiler/source/ast/ast.hpp @@ -1,5 +1,5 @@ /// AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #pragma once #include @@ -7,6 +7,7 @@ #include #include #include "../lexer.hpp" +#include "../types.hpp" // BlitzBasic Built-Ins // - Include: Followed by a String, which is the file to include at this location. @@ -30,34 +31,48 @@ namespace blitz { namespace ast { - class node { - public: + struct node { + std::vector tokens; + virtual ~node() = default; }; - class expression : public node {}; + struct variable : public node { + std::string name; + blitz::types::type type; + std::string struct_name; - class variable : public node { - blitz::token _token; - std::shared_ptr _value; - - public: virtual ~variable(); - variable(blitz::token token); - void set_value(std::shared_ptr value); + static bool can_parse(std::shared_ptr lexer); + static std::shared_ptr try_parse(std::shared_ptr lexer); }; - class call : public node {}; + struct value : public node { + enum class variant { + UNKNOWN, + NULL, + BOOL, + INTEGER, + UNSIGNED_INTEGER, + REAL, + STRING, + } type; + union { + bool b; + intmax_t i; + uintmax_t ui; + double f; + } number; + std::string text; - class local : public node { - public: - ~local(); - local(); + virtual ~value(); + + static bool can_parse(std::shared_ptr lexer); + static std::shared_ptr try_parse(std::shared_ptr lexer); }; - class global : public node {}; - + struct expression : public node {}; } // namespace ast } // namespace blitz diff --git a/code_compiler/source/error.cpp b/code_compiler/source/error.cpp index 73148b2..68c1436 100644 --- a/code_compiler/source/error.cpp +++ b/code_compiler/source/error.cpp @@ -1,5 +1,5 @@ // AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2024-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #include "error.hpp" #include diff --git a/code_compiler/source/error.hpp b/code_compiler/source/error.hpp index 86bb8ec..333fe6b 100644 --- a/code_compiler/source/error.hpp +++ b/code_compiler/source/error.hpp @@ -1,5 +1,5 @@ // AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2024-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #pragma once #include diff --git a/code_compiler/source/lexer.cpp b/code_compiler/source/lexer.cpp index 0f562ee..fe18c91 100644 --- a/code_compiler/source/lexer.cpp +++ b/code_compiler/source/lexer.cpp @@ -1,10 +1,11 @@ /// AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #include "lexer.hpp" #include #include #include +#include "util.hpp" std::string blitz::token::to_string() { @@ -67,7 +68,7 @@ blitz::lexer::~lexer() {} blitz::lexer::lexer(std::filesystem::path file) { // Usually files start at line 1 and character 1, so we should start there too. - _location = {1, 1}; + _location = { 1, 1 }; // Try and open the file for reading. _file = file; @@ -77,10 +78,10 @@ blitz::lexer::lexer(std::filesystem::path file) } // Initialize token storage to a default token. - _override = _current = blitz::token{ - .location = {0, 0}, + _next = _current = blitz::token{ + .location = { 0, 0 }, .text = "", - .type = token::variant::UNKNOWN, + .type = token::variant::NONE, }; } @@ -92,121 +93,89 @@ blitz::token blitz::lexer::current() blitz::token blitz::lexer::next() { _current = peek(); + _next = blitz::token{ + .location = { 0, 0 }, + .text = "", + .type = token::variant::NONE, + }; return _current; } blitz::token blitz::lexer::peek() { - enum class stage { - DEFAULT, - TEXT, - NUMBER, - STRING, - COMMENT, - } state = stage::DEFAULT; + if (_next.type == blitz::token::variant::NONE) { + // ToDo: Optimize + enum class stage { + DEFAULT, + TEXT, + NUMBER, + STRING, + COMMENT, + } state = stage::DEFAULT; - std::stringstream buffer; - blitz::token token{ - .location = _location, - .text = "", - .type = blitz::token::variant::UNKNOWN, - }; + std::stringstream buffer; + blitz::token token{ + .location = _location, + .text = "", + .type = blitz::token::variant::UNKNOWN, + }; - auto issymbol = [](int chr) { - switch (chr) { - case ';': // Comment - case ':': // Command Separator - case '=': // Equal - case '<': // Less Than - case '>': // Greater Than - case '~': // Bitwise Not - case '^': // Exponential (X ^ Y = pow(X, Y)) - case '+': // Plus - case '-': // Minus - case '*': // Multiply - case '/': // Divide - case ',': // Parameter Separation - case '%': // Integer Type - case '#': // Real Type - case '$': // String Type - case '.': // Structured Type - case '\\': // Structured Type Access - // Blitz Arrays - case '[': - case ']': - // Call, Grouping, Dim - case '(': - case ')': - return true; - default: - return false; + auto issymbol = [](int chr) { return blitz::utility::is_symbol(chr); }; + auto iswhitespace = [](int chr) { return blitz::utility::is_white_space(chr); }; + + // ToDo: Figure out why we don't ever hit chr == EOF. + if (_stream.eof()) { + token.location = _location; + token.type = blitz::token::variant::ENDOFFILE; + return token; } - return false; - }; - auto iswhitespace = [](int chr) { - switch (chr) { - case ' ': - case '\t': - return true; - default: - return false; - } - return false; - }; - // ToDo: Figure out why we don't ever hit chr == EOF. - if (_stream.eof()) { - token.location = _location; - token.type = blitz::token::variant::ENDOFFILE; - return token; - } + bool complete = false; + while (!complete && _stream.good()) { + // Peek at the current byte, without advancing the read pointer until we need to. + auto chr = _stream.peek(); + bool is_newline = (chr == '\r') || (chr == '\n'); + bool is_returnfeed = (chr == '\r'); - bool complete = false; - while (!complete && _stream.good()) { - // Peek at the current byte, without advancing the read pointer until we need to. - auto chr = _stream.peek(); - bool is_newline = (chr == '\r') || (chr == '\n'); - bool is_returnfeed = (chr == '\r'); + if (state == stage::DEFAULT) { + if (chr == EOF) { + token.type = blitz::token::variant::ENDOFFILE; + token.text = ""; + token.location = _location; + complete = true; + _location.second++; + } else if (is_newline) { + // New Line, should be handled like a control character, but with some special things. + token.type = blitz::token::variant::NEWLINE; + token.text = "\n"; + token.location = _location; + complete = true; - if (state == stage::DEFAULT) { - if (chr == EOF) { - token.type = blitz::token::variant::ENDOFFILE; - token.text = ""; - token.location = _location; - complete = true; - _location.second++; - } else if (is_newline) { - // New Line, should be handled like a control character, but with some special things. - token.type = blitz::token::variant::NEWLINE; - token.text = "\n"; - token.location = _location; - complete = true; - - // Advance the read pointer. - _stream.get(); - - // Is this a Windows-style \r\n? - if (is_returnfeed && (_stream.peek() == '\n')) { - // If so, advance the read pointer again. + // Advance the read pointer. _stream.get(); - } - // Then update the location. - _location.first++; - _location.second = 1; - } else if (iswhitespace(chr)) { - // This is white space, which we'll happily ignore. - _stream.get(); - _location.second++; - } else if (chr < 32) { - // Likely to be a control character. - token.location = _location; - token.type = blitz::token::variant::CONTROL; - token.text = {1, char(chr)}; - complete = true; - _stream.get(); - _location.second++; - /*} else if (chr == ':') { + // Is this a Windows-style \r\n? + if (is_returnfeed && (_stream.peek() == '\n')) { + // If so, advance the read pointer again. + _stream.get(); + } + + // Then update the location. + _location.first++; + _location.second = 1; + } else if (iswhitespace(chr)) { + // This is white space, which we'll happily ignore. + _stream.get(); + _location.second++; + } else if (chr < 32) { + // Likely to be a control character. + token.location = _location; + token.type = blitz::token::variant::CONTROL; + token.text = { 1, char(chr) }; + complete = true; + _stream.get(); + _location.second++; + /*} else if (chr == ':') { // Allows code writers to pretend it's all one line. token.location = _location; token.type = blitz::token::variant::SEPARATOR; @@ -214,165 +183,172 @@ blitz::token blitz::lexer::peek() complete = true; _stream.get(); _location.second++;*/ - } else if (chr == ';') { - // A comment, which ends at the next new line. - state = stage::COMMENT; - token.location = _location; - token.type = blitz::token::variant::COMMENT; - } else if (isdigit(chr)) { - // Probably an Integer, or if the latter, it's a Real. - state = stage::NUMBER; - token.location = _location; - token.type = blitz::token::variant::INTEGER; - } else if (isalpha(chr)) { - // Text of some kind. - state = stage::TEXT; - token.location = _location; - token.type = blitz::token::variant::TEXT; - } else if (chr == '"') { - // A quoted string. - state = stage::STRING; - token.location = _location; - token.type = blitz::token::variant::STRING; - - // Advance so we actually get anywhere. - _stream.get(); - _location.second++; - } else if (issymbol(chr)) { - // Special Handling for a few symbols that could mean multiple things. - if (chr == '.') { // '.' can start a Real, Label or Structured Type Access. We don't want to decide on the latter here, that's a parser thing. - buffer << (char)chr; - - // We advance the read pointer here to look at what's coming next. - _stream.get(); - chr = _stream.peek(); - _location.second++; - - // Peek at what's coming next. - if (isdigit(chr)) { - // This is a Real number. - token.location = _location; - token.type = blitz::token::variant::REAL; - state = stage::NUMBER; - } else { - // Assume this is a symbol and return to normal behavior. - token.location = _location; - token.text = buffer.str(); - token.type = blitz::token::variant::SYMBOL; - complete = true; - } - } else if ((chr == '+') || (chr == '-')) { // '+' & '-' could be prefixes to an Integer or Real. - buffer << (char)chr; - - // Advance the read pointer to peek at the future. - _stream.get(); - chr = _stream.peek(); - _location.second++; - - // Peek at what's coming up. - if (isdigit(chr) || (chr == '.')) { // Likely to be a Real or Integer. - token.location = _location; - if (chr == '.') { - token.type = blitz::token::variant::REAL; - } else { - token.type = blitz::token::variant::INTEGER; - } - state = stage::NUMBER; - } else { - token.location = _location; - token.text = buffer.str(); - token.type = blitz::token::variant::SYMBOL; - complete = true; - } - } else { + } else if (chr == ';') { + // A comment, which ends at the next new line. + state = stage::COMMENT; token.location = _location; - token.text = {1, char(chr)}; - token.type = blitz::token::variant::SYMBOL; - complete = true; + token.type = blitz::token::variant::COMMENT; + } else if (isdigit(chr)) { + // Probably an Integer, or if the latter, it's a Real. + state = stage::NUMBER; + token.location = _location; + token.type = blitz::token::variant::INTEGER; + } else if (isalpha(chr)) { + // Text of some kind. + state = stage::TEXT; + token.location = _location; + token.type = blitz::token::variant::TEXT; + } else if (chr == '"') { + // A quoted string. + state = stage::STRING; + token.location = _location; + token.type = blitz::token::variant::STRING; // Advance so we actually get anywhere. _stream.get(); _location.second++; - } - } else { - // Everything else is an error - throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it."); - } - } else if (state == stage::NUMBER) { - if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) { - // EOF, Control, NL, Whitespace, and Comments should return to default parsing. - complete = true; - } else if (isdigit(chr) || (chr == '.')) { - _stream.get(); - buffer << (char)chr; - if (chr == '.') { - if (token.type != token::variant::REAL) { - token.type = blitz::token::variant::REAL; + } else if (issymbol(chr)) { + // Special Handling for a few symbols that could mean multiple things. + if (chr == '.') { // '.' can start a Real, Label or Structured Type Access. We don't want to decide on the latter here, that's a parser thing. + buffer << (char)chr; + + // We advance the read pointer here to look at what's coming next. + _stream.get(); + chr = _stream.peek(); + _location.second++; + + // Peek at what's coming next. + if (isdigit(chr)) { + // This is a Real number. + token.location = _location; + token.type = blitz::token::variant::REAL; + state = stage::NUMBER; + } else { + // Assume this is a symbol and return to normal behavior. + token.location = _location; + token.text = buffer.str(); + token.type = blitz::token::variant::SYMBOL; + complete = true; + } + } else if ((chr == '+') || (chr == '-')) { // '+' & '-' could be prefixes to an Integer or Real. + buffer << (char)chr; + + // Advance the read pointer to peek at the future. + _stream.get(); + chr = _stream.peek(); + _location.second++; + + // Peek at what's coming up. + if (isdigit(chr) || (chr == '.')) { // Likely to be a Real or Integer. + token.location = _location; + if (chr == '.') { + token.type = blitz::token::variant::REAL; + } else { + token.type = blitz::token::variant::INTEGER; + } + state = stage::NUMBER; + } else { + token.location = _location; + token.text = buffer.str(); + token.type = blitz::token::variant::SYMBOL; + complete = true; + } } else { - token.text = buffer.str(); - throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [0-9], got '%s' instead.", token.to_string().c_str(), std::string{1, (char)chr}.c_str())); + token.location = _location; + token.text = { char(chr) }; + token.type = blitz::token::variant::SYMBOL; + complete = true; + + // Advance so we actually get anywhere. + _stream.get(); + _location.second++; } + } else { + // Everything else is an error + throw blitz::error(_file, _location, _location, "You've encountered a bug. Please report this with the file that caused it."); + } + } else if (state == stage::NUMBER) { + if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || (chr == ';')) { + // EOF, Control, NL, Whitespace, and Comments should return to default parsing. + complete = true; + } else if (isdigit(chr) || (chr == '.') || (chr == 'b') || (chr == 'x')) { + _stream.get(); + buffer << (char)chr; + if (chr == '.') { + if (token.type != token::variant::REAL) { + token.type = blitz::token::variant::REAL; + } else { + token.text = buffer.str(); + throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [0-9], got '%s' instead.", token.to_string().c_str(), std::string{ 1, (char)chr }.c_str())); + } + } + } else if (issymbol(chr)) { + complete = true; + } else { + token.text = buffer.str(); + throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected ([0](b|x|))[0-9.], got '%s' instead.", token.to_string().c_str(), std::string{ 1, (char)chr }.c_str())); } - } else if (issymbol(chr)) { - complete = true; - } else { - token.text = buffer.str(); - throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [0-9.], got '%s' instead.", token.to_string().c_str(), std::string{1, (char)chr}.c_str())); - } - if (complete) { - token.text = buffer.str(); - } - } else if (state == stage::TEXT) { - if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) { - // Return to default parsing. - complete = true; - } else if (isalpha(chr) || isdigit(chr) || (chr == '_')) { - buffer << (char)chr; - _stream.get(); - _location.second++; - } else { - token.text = buffer.str(); - throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [a-zA-Z0-9_], got '%s' instead.", token.to_string().c_str(), std::string{1, (char)chr}.c_str())); - } + if (complete) { + token.text = buffer.str(); + } + } else if (state == stage::TEXT) { + if ((chr == EOF) || (chr < 32) || is_newline || iswhitespace(chr) || issymbol(chr)) { + // Return to default parsing. + complete = true; + } else if (isalpha(chr) || isdigit(chr) || (chr == '_')) { + buffer << (char)chr; + _stream.get(); + _location.second++; + } else { + token.text = buffer.str(); + throw blitz::error(_file, token.location, _location, blitz::format("In token %s: Expected [a-zA-Z0-9_], got '%s' instead.", token.to_string().c_str(), std::string{ 1, (char)chr }.c_str())); + } - if (complete) { - token.text = buffer.str(); - } - } else if (state == stage::STRING) { - if ((chr == EOF) || (chr < 32) || is_newline) { - // Return to default parsing. - complete = true; - } else if (chr == '"') { // The only true way to end a string. - complete = true; + if (complete) { + token.text = buffer.str(); + } + } else if (state == stage::STRING) { + if ((chr == EOF) || (chr < 32) || is_newline) { + // Return to default parsing. + complete = true; + } else if (chr == '"') { // The only true way to end a string. + complete = true; - // Skip over the " so we don't confuse the parser. - _stream.get(); - _location.second++; - } else { - buffer << (char)chr; - _stream.get(); - _location.second++; - } + // Skip over the " so we don't confuse the parser. + _stream.get(); + _location.second++; + } else { + buffer << (char)chr; + _stream.get(); + _location.second++; + } - if (complete) { - token.text = buffer.str(); - } - } else if (state == stage::COMMENT) { - if ((chr == EOF) || (chr < 32) || is_newline) { - // Return to default parsing at this point. - complete = true; - } else { - buffer << (char)chr; - _stream.get(); - _location.second++; - } + if (complete) { + token.text = buffer.str(); + } + } else if (state == stage::COMMENT) { + if ((chr == EOF) || (chr < 32) || is_newline) { + // Return to default parsing at this point. + complete = true; + } else { + buffer << (char)chr; + _stream.get(); + _location.second++; + } - if (complete) { - token.text = buffer.str(); + if (complete) { + token.text = buffer.str(); + } } } + _next = token; } - return token; + return _next; +} + +std::filesystem::path blitz::lexer::file() +{ + return std::filesystem::path(_file); } diff --git a/code_compiler/source/lexer.hpp b/code_compiler/source/lexer.hpp index 25990ee..def21f4 100644 --- a/code_compiler/source/lexer.hpp +++ b/code_compiler/source/lexer.hpp @@ -1,5 +1,5 @@ /// AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #pragma once #include @@ -21,6 +21,7 @@ namespace blitz { std::pair location; std::string text; enum class variant : uint64_t { + NONE, // There is no token here. UNKNOWN, // We have absolutely no fucking clue. ENDOFFILE, // End of the file. NEWLINE, // New Line. @@ -48,7 +49,7 @@ namespace blitz { std::pair _location; blitz::token _current; - blitz::token _override; + blitz::token _next; public: ~lexer(); @@ -69,5 +70,8 @@ namespace blitz { * The current token will remain in-tact. */ blitz::token peek(); + + public: + std::filesystem::path file(); }; } // namespace blitz diff --git a/code_compiler/source/main.cpp b/code_compiler/source/main.cpp index 1425ba2..accc613 100644 --- a/code_compiler/source/main.cpp +++ b/code_compiler/source/main.cpp @@ -1,20 +1,21 @@ // AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #include #include #include "compiler.hpp" #include "error.hpp" #include "lexer.hpp" +#include "parser.hpp" int main(int argc, char** argv) { - std::setlocale(LC_ALL, "en_US.UTF-8"); - - std::cout << argv[1] << std::endl; - blitz::lexer lex(argv[1]); - try { + std::setlocale(LC_ALL, "en_US.UTF-8"); + + std::cout << argv[1] << std::endl; + + blitz::lexer lex(argv[1]); for (blitz::token token = lex.next(); (token.type != blitz::token::variant::ENDOFFILE); token = lex.next()) { switch (token.type) { case blitz::token::variant::COMMENT: @@ -44,15 +45,19 @@ int main(int argc, char** argv) std::cin.get(); } } + + blitz::parser pars(argv[1]); + + //std::cin.get(); + return 0; } catch (blitz::error const& ex) { std::cout << ex.file() << std::endl; std::cout << "Line " << ex.at().first << ", Char " << ex.at().second << ": " << ex.what() << std::endl; + return 1; } catch (std::runtime_error const& ex) { std::cout << ex.what() << std::endl; + return 1; } - - //std::cin.get(); - return 0; } // BlitzBasic is a strange but powerful language in the right hands. While it has @@ -79,6 +84,7 @@ int main(int argc, char** argv) // // 3. Function calls don't always need Parenthesis: // ``` +// Local myName // Function myName() : End Function // If myName() Then : EndIf ; <- Calls myName // myName ; <- Calls myName, because there is no = after it. diff --git a/code_compiler/source/parser.cpp b/code_compiler/source/parser.cpp index a3edb07..eb3713c 100644 --- a/code_compiler/source/parser.cpp +++ b/code_compiler/source/parser.cpp @@ -1,5 +1,5 @@ /// AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2024-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #include "parser.hpp" #include @@ -12,101 +12,150 @@ blitz::parser::~parser() {} blitz::parser::parser(std::filesystem::path file) : _file(file), _lexer(), _expr() { _lexer = std::make_shared(file); + _lexer->next(); + blitz::ast::variable::try_parse(_lexer); } -std::shared_ptr blitz::parser::current() -{ - return _expr; -} -std::shared_ptr blitz::parser::next() -{ - // This should return an entire "line" of expressions in one go, i.e.: - // 1. Local a = 1, b = a, c = b+a - // -> Local(Variable(a, Expression(Integer(1))), Variable(b, Expresssion(Variable(a))), Variable(c, Expression(Add(Variable(b), Variable(a))) - // 2. Include "HelloWorld.bb" - // -> Include(String("HelloWorld.bb")) - // 3. Function HelloWorld() - // -> Function(HelloWorld, ...)( - // Not quite sure if the above makes sense, we'd be returning many expressions outside of functions, but only one inside a function? Why even bother with the current/next crap then? - // Handling Include becomes a problem too. I guess we should actually return expressions on a line by line basis, and let the "compiler" figure out scope and stuff. +// +//std::shared_ptr blitz::parser::current() +//{ +// return _expr; +//} +// +//std::shared_ptr blitz::parser::next() +//{ +// // This should return an entire "line" of expressions in one go, i.e.: +// // 1. Local a = 1, b = a, c = b+a +// // -> Local(Variable(a, Expression(Integer(1))), Variable(b, Expresssion(Variable(a))), Variable(c, Expression(Add(Variable(b), Variable(a))) +// // 2. Include "HelloWorld.bb" +// // -> Include(String("HelloWorld.bb")) +// // 3. Function HelloWorld() +// // -> Function(HelloWorld, ...)( +// // Not quite sure if the above makes sense, we'd be returning many expressions outside of functions, but only one inside a function? Why even bother with the current/next crap then? +// // Handling Include becomes a problem too. I guess we should actually return expressions on a line by line basis, and let the "compiler" figure out scope and stuff. +// +// // Grab the next token to figure out what behavior we should have. +// while (true) { +// auto token = _lexer->next(); +// try { +// switch (token.type) { +// case blitz::token::variant::ENDOFFILE: +// // End of file means there's nothing left to parse. +// _expr.reset(); +// return nullptr; +// case blitz::token::variant::COMMENT: +// case blitz::token::variant::NEWLINE: +// case blitz::token::variant::SEPARATOR: +// // Ignore some things that aren't very useful right now. +// continue; +// case blitz::token::variant::TEXT: +// return try_parse(token); +// default: +// throw nullptr; +// } +// } catch (blitz::error const& ex) { +// throw ex; +// } catch (std::exception const& ex) { +// throw new blitz::error(_file, token.location, token.location, ex.what()); +// } catch (...) { +// throw new blitz::error(_file, token.location, token.location, blitz::format("Token %s unexpected at this point.", token.to_string().c_str())); +// } +// } +//} +// +//std::shared_ptr blitz::parser::try_parse(blitz::token token) +//{ +// // ToDo: Switch to a proper Unicode library. Maybe Boost? +// std::string ltext; +// std::transform(token.text.begin(), token.text.end(), ltext.begin(), [](std::string::value_type c) { return std::tolower(c); }); +// +// if ((ltext == "local") || (ltext == "global")) { +// // Local/Global have the same parsing, but different functionality. +// // Should be: +// // Text Text [Symbol(=) Expression] [Symbol(,) Text [Symbol(=) Expression] [Symbol(,) ...]] +// +// } else if (ltext == "function") { +// //Example: +// // Function FunctionName[$,%,#,:TypeName,.StructName]([Variable[, Variable=Value[, ...]]) +// // [Function Content ...] +// // EndFunction +// +// +// } else if (ltext == "select") { +// } else if (ltext == "case") { +// } else if (ltext == "endselect") { +// } else if (ltext == "if") { +// } else if (ltext == "elif") { +// } else if (ltext == "endif") { +// +// } else if (ltext == "end") { +// } +// +// return nullptr; +//} +// +//std::shared_ptr blitz::parser::try_parse_expression() { +// // () + - / * = <> > < String Integer Float Variable +//} +// +//std::shared_ptr blitz::parser::try_parse_variable_expression() +//{ +// // Text [Symbol(=) Expression(...)] [Symbol(,) [Text [Symbol(=) Expression(...)]]] +// +// auto label = _lexer->next(); +// if (label != blitz::token::variant::TEXT) { +// throw new blitz::error(_file, label.location, label.location, blitz::format("Unexpected %s, expected Text.", label.to_string().c_str())); +// } +// +// auto node = std::make_shared(label); +// +// auto operand = _lexer->next(); +// if (operand == "=") { +// //node->set_value(try_parse_expression()); +// } else if (operand == blitz::token::variant::NEWLINE || operand == blitz::token::variant::SEPARATOR || (operand == blitz::token::variant::SYMBOL && operand == ",")) { +// return node; +// } else { +// throw new blitz::error(_file, label.location, operand.location, blitz::format("Unexpected %s, expected Symbol(=), NewLine, Separator, or Symbol(,).", operand.to_string().c_str())); +// } +// +// return node; +//} - // Grab the next token to figure out what behavior we should have. - while (true) { - auto token = _lexer->next(); - try { - switch (token.type) { - case blitz::token::variant::ENDOFFILE: - // End of file means there's nothing left to parse. - _expr.reset(); - return nullptr; - case blitz::token::variant::COMMENT: - case blitz::token::variant::NEWLINE: - case blitz::token::variant::SEPARATOR: - // Ignore some things that aren't very useful right now. - continue; - case blitz::token::variant::TEXT: - return try_parse(token); - default: - throw nullptr; - } - } catch (blitz::error const& ex) { - throw ex; - } catch (std::exception const& ex) { - throw new blitz::error(_file, token.location, token.location, ex.what()); - } catch (...) { - throw new blitz::error(_file, token.location, token.location, blitz::format("Token %s unexpected at this point.", token.to_string().c_str())); - } - } -} -std::shared_ptr blitz::parser::try_parse(blitz::token token) -{ - // ToDo: Switch to a proper Unicode library. Maybe Boost? - std::string ltext; - std::transform(token.text.begin(), token.text.end(), ltext.begin(), [](std::string::value_type c) { return std::tolower(c); }); - - if ((ltext == "local") || (ltext == "global")) { - // Local/Global have the same parsing, but different functionality. - // Should be: - // Text Text [Symbol(=) Expression] [Symbol(,) Text [Symbol(=) Expression] [Symbol(,) ...]] - - } else if (ltext == "global") { - // Global ... - - } else if (ltext == "function") { - } else if (ltext == "select") { - } else if (ltext == "case") { - } else if (ltext == "endselect") { - } else if (ltext == "if") { - } else if (ltext == "elif") { - } else if (ltext == "endif") { - - } else if (ltext == "end") { - } - - return nullptr; -} - -std::shared_ptr blitz::parser::try_parse_variable() -{ - // Text [Symbol(=) Expression(...)] [Symbol(,) [Text [Symbol(=) Expression(...)]]] - - auto label = _lexer->next(); - if (label != blitz::token::variant::TEXT) { - throw new blitz::error(_file, label.location, label.location, blitz::format("Unexpected %s, expected Text.", label.to_string().c_str())); - } - - auto node = std::make_shared(label); - - auto operand = _lexer->next(); - if (operand == "=") { - //node->set_value(try_parse_expression()); - } else if (operand == blitz::token::variant::NEWLINE || operand == blitz::token::variant::SEPARATOR || (operand == blitz::token::variant::SYMBOL && operand == ",")) { - return node; - } else { - throw new blitz::error(_file, label.location, operand.location, blitz::format("Unexpected %s, expected Symbol(=), NewLine, Separator, or Symbol(,).", operand.to_string().c_str())); - } - - return node; -} +/* Expressions + * + * Example Locations: + * - Local Var = Expression + * - Var = Expression + * - myFunction(Expression, ...) + * - If Expression Then + * + * Example Expressions: + * - 0 + 0, 0 - 0, 0 * 0, 0 / 0, 0 Shr 0, 0 Shl 0, 0 And 0, 0 Or 0, Not 0, + * - + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + */ diff --git a/code_compiler/source/parser.hpp b/code_compiler/source/parser.hpp index 5fa66da..fcb2779 100644 --- a/code_compiler/source/parser.hpp +++ b/code_compiler/source/parser.hpp @@ -1,5 +1,5 @@ /// AUTOGENERATED COPYRIGHT HEADER START -// Copyright (C) 2017-2024 Michael Fabian 'Xaymar' Dirks +// Copyright (C) 2017-2025 Michael Fabian 'Xaymar' Dirks // AUTOGENERATED COPYRIGHT HEADER END #pragma once #include diff --git a/code_compiler/source/types.cpp b/code_compiler/source/types.cpp new file mode 100644 index 0000000..bcb87c2 --- /dev/null +++ b/code_compiler/source/types.cpp @@ -0,0 +1,70 @@ +// AUTOGENERATED COPYRIGHT HEADER START +// Copyright (C) 2025 Michael Fabian 'Xaymar' Dirks +// AUTOGENERATED COPYRIGHT HEADER END +#include "types.hpp" +#include +#include + +const std::pair _map_to[] = { + { "byte", blitz::types::type::INT8 }, // + { "int8", blitz::types::type::INT8 }, // + { "ubyte", blitz::types::type::UINT8 }, // + { "uint8", blitz::types::type::UINT8 }, // + { "short", blitz::types::type::INT16 }, // + { "int16", blitz::types::type::INT16 }, // + { "ushort", blitz::types::type::UINT16 }, // + { "uint16", blitz::types::type::UINT16 }, // + { "int", blitz::types::type::INT32 }, // + { "int32", blitz::types::type::INT32 }, // + { "uint", blitz::types::type::UINT32 }, // + { "uint32", blitz::types::type::UINT32 }, // + { "long ", blitz::types::type::INT64 }, // + { "int64", blitz::types::type::INT64 }, // + { "ulong", blitz::types::type::UINT64 }, // + { "uint64", blitz::types::type::UINT64 }, // + { "half", blitz::types::type::FLOAT16 }, // + { "float16", blitz::types::type::FLOAT16 }, // + { "real16", blitz::types::type::FLOAT16 }, // + { "single", blitz::types::type::FLOAT32 }, // + { "float", blitz::types::type::FLOAT32 }, // + { "float32", blitz::types::type::FLOAT32 }, // + { "real", blitz::types::type::FLOAT32 }, // + { "real32", blitz::types::type::FLOAT32 }, // + { "double", blitz::types::type::DOUBLE }, // + { "float64", blitz::types::type::DOUBLE }, // + { "real64", blitz::types::type::DOUBLE }, // + { "string", blitz::types::type::STRING }, // +}; + +std::string blitz::types::to_string(blitz::types::type type) +{ + if (type == type::STRUCT) { + return "struct"; + } + + for (auto kv : _map_to) { + if (type == kv.second) { + return kv.first; + } + } + + return "Unknown"; +} + +blitz::types::type blitz::types::from_string(std::string text) +{ + std::transform(text.cbegin(), text.cend(), text.begin(), [](char from) { + if (from & 0b10000000) { // Exclude Unicode + return from; + } + return (char)std::tolower(from); + }); + + for (auto kv : _map_to) { + if (text == kv.first) { + return kv.second; + } + } + + return blitz::types::type::UNKNOWN; +} diff --git a/code_compiler/source/types.hpp b/code_compiler/source/types.hpp new file mode 100644 index 0000000..e683b47 --- /dev/null +++ b/code_compiler/source/types.hpp @@ -0,0 +1,52 @@ + +// AUTOGENERATED COPYRIGHT HEADER START +// Copyright (C) 2025 Michael Fabian 'Xaymar' Dirks +// AUTOGENERATED COPYRIGHT HEADER END +#pragma once +#include +#include + +namespace blitz { + namespace types { + enum class type : uint8_t { + UNKNOWN, + // 8-bit Integers + INT8, + BYTE = INT8, + UINT8, + UBYTE = UINT8, + // 16-bit Integers + INT16, + SHORT = INT16, + UINT16, + USHORT = UINT16, + // 32-bit Integers + INT32, + INT = INT32, + UINT32, + UINT = UINT32, + // 64-bit Integers + INT64, + LONG = INT64, + UINT64, + ULONG = UINT64, + // 16-bit Float + FLOAT16, + HALF = FLOAT16, + // 32-bit Float + FLOAT32, + FLOAT = FLOAT32, + SINGLE = FLOAT32, + // 64-bit Float + FLOAT64, + DOUBLE = FLOAT64, + // UTF-8 String + STRING, + // User-defined Struct + STRUCT, + }; + + std::string to_string(blitz::types::type type); + blitz::types::type from_string(std::string text); + }; +} // namespace blitz diff --git a/code_compiler/source/util.cpp b/code_compiler/source/util.cpp new file mode 100644 index 0000000..375c958 --- /dev/null +++ b/code_compiler/source/util.cpp @@ -0,0 +1,59 @@ +// AUTOGENERATED COPYRIGHT HEADER START +// Copyright (C) 2025 Michael Fabian 'Xaymar' Dirks +// AUTOGENERATED COPYRIGHT HEADER END +#include "util.hpp" +#include + +bool blitz::utility::is_symbol(int code) +{ + switch (chr) { + case ';': // Comment + case ':': // Command Separator + case '=': // Equal + case '<': // Less Than + case '>': // Greater Than + case '~': // Bitwise Not + case '^': // Exponential (X ^ Y = pow(X, Y)) + case '+': // Plus + case '-': // Minus + case '*': // Multiply + case '/': // Divide + case ',': // Parameter Separation + case '%': // Integer Type + case '#': // Real Type + case '$': // String Type + case '.': // Structured Type + case '\\': // Structured Type Access + + case '[': // Blitz Arrays + case ']': + + case '(': // Call, Grouping, Dim + case ')': + return true; + default: + return false; + } + return false; +} + +bool blitz::utility::is_white_space(int code) +{ + switch (chr) { + case ' ': + case '\t': + return true; + default: + return false; + } + return false; +} + +bool blitz::utility::is_digit(int code) +{ + return isdigit(code); +} + +bool blitz::utility::is_alpha(int code) { + return isalpha(code); +} diff --git a/code_compiler/source/util.hpp b/code_compiler/source/util.hpp new file mode 100644 index 0000000..0d3ed23 --- /dev/null +++ b/code_compiler/source/util.hpp @@ -0,0 +1,13 @@ +// AUTOGENERATED COPYRIGHT HEADER START +// Copyright (C) 2025 Michael Fabian 'Xaymar' Dirks +// AUTOGENERATED COPYRIGHT HEADER END + +namespace blitz::utility { + bool is_symbol(int code); + + bool is_white_space(int code); + + bool is_digit(int code); + + bool is_alpha(int code); +} // namespace blitz::utility diff --git a/tests/test.bb b/tests/test.bb new file mode 100644 index 0000000..effe272 --- /dev/null +++ b/tests/test.bb @@ -0,0 +1,4 @@ +; AUTOGENERATED COPYRIGHT HEADER START +; Copyright (C) 2025 Michael Fabian 'Xaymar' Dirks +; AUTOGENERATED COPYRIGHT HEADER END +Variable:String