code_compiler: Implement Lexer for Blitz

The Lexer should now be able to split all existing Blitz code into easily identifiable Tokens. With this information, a Parser can verify that the syntax is correct, warn about possibly dangerous or incorrect usage, and convert the Tokens into actual LLVM code for compiling.
2017-11-13 02:14:57 +01:00
parent 89b98c6925
commit d5945e8c39
2 changed files with 297 additions and 0 deletions
@@ -0,0 +1,171 @@
+//	Code Compiler for BlitzLLVM
+//	Copyright(C) 2017 Michael Fabian Dirks
+//
+//	This program is free software: you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or
+//	(at your option) any later version.
+//
+//	This program is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+#include "lexer.hpp"
+#include <cctype>
+#include <codecvt>
+#include <stdexcept>
+
+// Characters that are skipped between tokens. Line breaks are not listed
+// here; GetNextToken() checks for '\r' and '\n' separately.
+char g_whitespaceCharacters[] = { ' ', '\t' };
+
+// Lookup table from a single symbol character to the Token reported for it.
+// GetNextToken() scans this table linearly and stops at the first match.
+// The double quote ('"') and the dot ('.') are intentionally absent because
+// GetNextToken() gives both of them special treatment.
+std::pair<char, BlitzLLVM::Lexer::Token> g_symbolCharacters[] = {
+	// Operators and slashes
+	{ '+',  BlitzLLVM::Lexer::Token::TokenPlus               },
+	{ '-',  BlitzLLVM::Lexer::Token::TokenMinus              },
+	{ '/',  BlitzLLVM::Lexer::Token::TokenSlashForward       },
+	{ '\\', BlitzLLVM::Lexer::Token::TokenSlashBackward      },
+	{ '*',  BlitzLLVM::Lexer::Token::TokenMultiply           },
+	{ '=',  BlitzLLVM::Lexer::Token::TokenEqual              },
+
+	// '#', '%' and '$'
+	{ '#',  BlitzLLVM::Lexer::Token::TokenOctothorp          },
+	{ '%',  BlitzLLVM::Lexer::Token::TokenPercent            },
+	{ '$',  BlitzLLVM::Lexer::Token::TokenDollar             },
+
+	// Brackets
+	{ '(',  BlitzLLVM::Lexer::Token::TokenRoundBracketOpen   },
+	{ ')',  BlitzLLVM::Lexer::Token::TokenRoundBracketClose  },
+	{ '[',  BlitzLLVM::Lexer::Token::TokenSquareBracketOpen  },
+	{ ']',  BlitzLLVM::Lexer::Token::TokenSquareBracketClose },
+	{ '<',  BlitzLLVM::Lexer::Token::TokenAngleBracketOpen   },
+	{ '>',  BlitzLLVM::Lexer::Token::TokenAngleBracketClose  },
+
+	// Remaining punctuation
+	{ ':',  BlitzLLVM::Lexer::Token::TokenColon              },
+	{ ',',  BlitzLLVM::Lexer::Token::TokenComma              },
+	{ ';',  BlitzLLVM::Lexer::Token::TokenSemicolon          },
+	{ '^',  BlitzLLVM::Lexer::Token::TokenCaret              },
+	{ '~',  BlitzLLVM::Lexer::Token::TokenBitNot             },
+};
+
+// Binds the lexer to the stream it reads from. Only a reference is stored,
+// so the stream has to stay alive for as long as the Lexer is used.
+BlitzLLVM::Lexer::Lexer(std::istream& fs)
+	: m_fileStream(fs) {
+}
+
+// The lexer holds no resources of its own, so there is nothing to release.
+BlitzLLVM::Lexer::~Lexer() = default;
+
+std::pair<BlitzLLVM::Lexer::Token, std::string> BlitzLLVM::Lexer::GetNextToken() {
+	std::string buf;
+	Token tkn = Token::TokenEOF;
+	bool haveResult = false;
+
+	// Allow "overriding" the next retrieved Token.
+	if (m_overrideToken != Token::TokenUnknown) {
+		buf = m_overrideText;
+		tkn = m_overrideToken;
+		m_overrideToken = Token::TokenUnknown;
+		haveResult = true;
+	}
+
+	while (((m_fileStream.eof() == false) && (m_fileStream.good())) && !haveResult) {
+		char chr = m_fileStream.get();
+
+		if (chr == '\r' || chr == '\n') {
+			if (tkn != Token::TokenEOF) {
+				m_overrideToken = Token::TokenNewLine;
+				m_overrideText = "";
+			} else {
+				tkn = Token::TokenNewLine;
+				buf = "";
+			}
+
+			m_isStringMode = false;
+			m_isNumberMode = false;
+			m_isTextMode = false;
+			break;
+		} else if (!m_isStringMode && !m_isTextMode && !m_isNumberMode) {
+			// Whitespace
+			bool isWhitespace = false;
+			for (char v : g_whitespaceCharacters) {
+				if (v == chr) {
+					isWhitespace = true;
+					break;
+				}
+			}
+			if (isWhitespace)
+				continue;
+
+			// Symbol
+			for (auto v : g_symbolCharacters) {
+				if (v.first == chr) {
+					tkn = v.second;
+					buf = v.first;
+					break;
+				}
+			}
+			if (tkn != Token::TokenEOF) {
+				haveResult = true;
+				break;
+			}
+			
+			// Strings, Text, Numbers
+			if (chr == '\"') {
+				m_isStringMode = true;
+				tkn = Token::TokenDoubleQuote;
+				buf = chr;
+				break;
+			} else if (isalpha(chr)) {
+				m_isTextMode = true;
+				tkn = Token::TokenText;
+				buf = chr;
+			} else if (isdigit(chr)) {
+				m_isNumberMode = true;
+				m_numberModeHasDecimal = false;
+				tkn = Token::TokenNumber;
+				buf = chr;
+			} else if (chr == '.') {
+				m_isNumberMode = true;
+				m_numberModeHasDecimal = true;
+				tkn = Token::TokenDecimal;
+				buf = "0" + chr;
+			} else {
+				tkn = Token::TokenUnknown;
+				buf = chr;
+				break;
+			}
+		} else if (m_isStringMode) {
+			if (chr == '\"') {
+				m_overrideToken = Token::TokenDoubleQuote;
+				m_overrideText = chr;
+				m_isStringMode = false;
+				tkn = Token::TokenQuotedText;
+				break;
+			} else {
+				buf += chr;
+			}
+		} else if (m_isTextMode) {
+			if (isalnum(chr) || (chr == '_')) {
+				buf += chr;
+			} else {
+				m_fileStream.putback(chr);
+				m_isTextMode = false;
+				break;
+			}
+		} else if (m_isNumberMode) {
+			if (isdigit(chr)) {
+				buf += chr;
+			} else if (chr == '.') {
+				if (m_numberModeHasDecimal == false) {
+					m_numberModeHasDecimal = true;
+					tkn = Token::TokenDecimal;
+					buf += chr;
+				} else {
+					throw std::runtime_error("Unexpected '.' while parsing number.");
+				}
+			} else {
+				m_fileStream.putback(chr);
+				m_isNumberMode = false;
+				break;				
+			}
+		}
+	}
+
+	return std::make_pair(tkn, buf);
+}
@@ -0,0 +1,126 @@
+//	Code Compiler for BlitzLLVM
+//	Copyright(C) 2017 Michael Fabian Dirks
+//
+//	This program is free software: you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or
+//	(at your option) any later version.
+//
+//	This program is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+#pragma once
+#include <list>
+#include <istream>
+#include <string>
+#include <inttypes.h>
+
+namespace BlitzLLVM {	
+	// Splits source text read from a std::istream into Tokens, which are
+	// handed out one at a time by GetNextToken().
+	class Lexer {
+		public:
+		// Kinds of token. The enumerators rely on implicit numbering, so
+		// inserting or reordering entries changes their values.
+		enum class Token : uint64_t {
+			// Character the lexer does not recognise. Also used by the
+			// implementation as the "nothing queued" value of m_overrideToken.
+			TokenUnknown,
+			// No more input is available from the stream.
+			TokenEOF,
+			// Line break ('\r' or '\n').
+			TokenNewLine,
+
+			// Symbols
+			TokenPlus,
+			TokenMinus,
+			TokenSlashForward,
+			TokenSlashBackward,
+			TokenMultiply,
+			TokenEqual,
+			TokenOctothorp,
+			TokenPercent,
+			TokenDollar,
+			TokenRoundBracketOpen,
+			TokenRoundBracketClose,
+			TokenSquareBracketOpen,
+			TokenSquareBracketClose,
+			TokenAngleBracketOpen,
+			TokenAngleBracketClose,
+			// NOTE(review): GetNextToken() as written never returns TokenDot;
+			// a '.' outside a string is treated as part of a decimal number.
+			TokenDot,
+			TokenColon,
+			TokenComma,
+			TokenSemicolon,
+			TokenCaret,
+			TokenBitNot /*~*/,
+
+			// String Delimiter
+			TokenDoubleQuote,
+
+			// Types
+			TokenText, // A letter followed by letters, digits or '_'.
+			TokenNumber, // Digits only.
+			TokenDecimal, // Digits containing a single '.'.
+			TokenQuotedText, // Text encapsulated by TokenDoubleQuote
+
+			// NOTE(review): GetNextToken() as written does not return any of
+			// the keyword tokens below; words come back as TokenText.
+			// Presumably a later stage maps them - confirm.
+
+			// Binary
+			TokenNot,
+			TokenAnd, TokenOr, TokenXor,
+			TokenShl, TokenShr,
+			TokenSar, TokenSal,
+			TokenFalse, TokenTrue,
+
+			// Conversion
+			TokenFloat,
+			TokenString, TokenHex,
+			TokenInt,
+
+			// Control
+			TokenIf, TokenThen, TokenElseIf, TokenElse, TokenEndIf,
+			TokenSelect, TokenCase, TokenDefault, // End Select = TokenEnd, TokenSelect.
+			TokenGoto, TokenGosub,
+			TokenReturn,
+			TokenFunction, // End Function = TokenEnd, TokenFunction.
+			TokenEnd,
+			TokenStop /* DEBUGGER! Ignore in Release mode. */,
+
+			// Loop
+			TokenFor, TokenTo, TokenNext,
+			TokenWhile, TokenWend,
+			TokenRepeat, TokenUntil, TokenForever,
+			TokenExit,
+
+			// Math
+			TokenAbs, TokenSign /*Sgn*/,
+			TokenCos, TokenSin, TokenTan,
+			TokenACos, TokenASin, TokenATan, TokenATan2,
+			TokenLog, TokenLog10,
+			TokenCeil, TokenFloor,
+			TokenMod,
+			TokenPi,
+			TokenExp, TokenSqr,
+
+			// Variables
+			TokenConst,
+			TokenGlobal,
+			TokenLocal,
+
+			// Including files.
+			TokenInclude,			
+		};
+
+		public:
+		// Creates a lexer reading from fs. The stream is stored by reference
+		// and therefore has to outlive the Lexer.
+		Lexer(std::istream& fs);
+		~Lexer();
+
+		// Returns the next token together with its text. A quoted string is
+		// delivered over three calls: TokenDoubleQuote, TokenQuotedText,
+		// TokenDoubleQuote. Throws std::runtime_error when a number contains
+		// more than one '.'.
+		std::pair<Token, std::string> GetNextToken();
+		
+		private:
+		// Stream the characters are read from.
+		std::istream& m_fileStream;
+
+		// Which kind of token is currently being collected. String mode
+		// persists between calls (it is entered when the opening quote is
+		// returned); a line break resets all three.
+		bool m_isTextMode = false;
+		bool m_isNumberMode = false;
+		bool m_isStringMode = false;
+		// Set once the number being collected contains a '.'.
+		bool m_numberModeHasDecimal = false;
+
+		// Token (and its text) queued to be returned by the next call to
+		// GetNextToken(). TokenUnknown means nothing is queued.
+		Token m_overrideToken = Token::TokenUnknown;
+		std::string m_overrideText = "";
+	};
+}