Files
BlitzLLVM/projects/code_compiler/source/lexer.cpp
T
Michael Fabian 'Xaymar' Dirks d5945e8c39 code_compiler: Implement Lexer for Blitz
The Lexer should now be able to parse all existing Blitz code into easily identified Tokens. With this information, a Parser can now verify that the syntax was kept correctly, warn about possible dangerous or wrong use and convert the Tokens into actual LLVM code for compiling.
2017-11-13 02:14:57 +01:00

172 lines
4.8 KiB
C++

// Code Compiler for BlitzLLVM
// Copyright(C) 2017 Michael Fabian Dirks
//
// This program is free software : you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.If not, see <https://www.gnu.org/licenses/>.
#include "lexer.hpp"
#include <codecvt>
char g_whitespaceCharacters[] = {
' ',
'\t',
};
std::pair<char, BlitzLLVM::Lexer::Token> g_symbolCharacters[] = {
//{ '\"', BlitzLLVM::Lexer::Token::TokenDoubleQuote }, // Has special meaning.
{ '+', BlitzLLVM::Lexer::Token::TokenPlus },
{ '-', BlitzLLVM::Lexer::Token::TokenMinus },
{ '/', BlitzLLVM::Lexer::Token::TokenSlashForward },
{ '\\', BlitzLLVM::Lexer::Token::TokenSlashBackward },
{ '*', BlitzLLVM::Lexer::Token::TokenMultiply },
{ '=', BlitzLLVM::Lexer::Token::TokenEqual },
{ '#', BlitzLLVM::Lexer::Token::TokenOctothorp },
{ '%', BlitzLLVM::Lexer::Token::TokenPercent },
{ '$', BlitzLLVM::Lexer::Token::TokenDollar },
{ '(', BlitzLLVM::Lexer::Token::TokenRoundBracketOpen },
{ ')', BlitzLLVM::Lexer::Token::TokenRoundBracketClose },
{ '[', BlitzLLVM::Lexer::Token::TokenSquareBracketOpen },
{ ']', BlitzLLVM::Lexer::Token::TokenSquareBracketClose },
{ '<', BlitzLLVM::Lexer::Token::TokenAngleBracketOpen },
{ '>', BlitzLLVM::Lexer::Token::TokenAngleBracketClose },
//{ '.', BlitzLLVM::Lexer::Token::TokenDot }, // Special meaning.
{ ':', BlitzLLVM::Lexer::Token::TokenColon },
{ ',', BlitzLLVM::Lexer::Token::TokenComma },
{ ';', BlitzLLVM::Lexer::Token::TokenSemicolon },
{ '^', BlitzLLVM::Lexer::Token::TokenCaret },
{ '~', BlitzLLVM::Lexer::Token::TokenBitNot },
};
BlitzLLVM::Lexer::Lexer(std::istream& fs) : m_fileStream(fs) {}
BlitzLLVM::Lexer::~Lexer() {}
std::pair<BlitzLLVM::Lexer::Token, std::string> BlitzLLVM::Lexer::GetNextToken() {
std::string buf;
Token tkn = Token::TokenEOF;
bool haveResult = false;
// Allow "overriding" the next retrieved Token.
if (m_overrideToken != Token::TokenUnknown) {
buf = m_overrideText;
tkn = m_overrideToken;
m_overrideToken = Token::TokenUnknown;
haveResult = true;
}
while (((m_fileStream.eof() == false) && (m_fileStream.good())) && !haveResult) {
char chr = m_fileStream.get();
if (chr == '\r' || chr == '\n') {
if (tkn != Token::TokenEOF) {
m_overrideToken = Token::TokenNewLine;
m_overrideText = "";
} else {
tkn = Token::TokenNewLine;
buf = "";
}
m_isStringMode = false;
m_isNumberMode = false;
m_isTextMode = false;
break;
} else if (!m_isStringMode && !m_isTextMode && !m_isNumberMode) {
// Whitespace
bool isWhitespace = false;
for (char v : g_whitespaceCharacters) {
if (v == chr) {
isWhitespace = true;
break;
}
}
if (isWhitespace)
continue;
// Symbol
for (auto v : g_symbolCharacters) {
if (v.first == chr) {
tkn = v.second;
buf = v.first;
break;
}
}
if (tkn != Token::TokenEOF) {
haveResult = true;
break;
}
// Strings, Text, Numbers
if (chr == '\"') {
m_isStringMode = true;
tkn = Token::TokenDoubleQuote;
buf = chr;
break;
} else if (isalpha(chr)) {
m_isTextMode = true;
tkn = Token::TokenText;
buf = chr;
} else if (isdigit(chr)) {
m_isNumberMode = true;
m_numberModeHasDecimal = false;
tkn = Token::TokenNumber;
buf = chr;
} else if (chr == '.') {
m_isNumberMode = true;
m_numberModeHasDecimal = true;
tkn = Token::TokenDecimal;
buf = "0" + chr;
} else {
tkn = Token::TokenUnknown;
buf = chr;
break;
}
} else if (m_isStringMode) {
if (chr == '\"') {
m_overrideToken = Token::TokenDoubleQuote;
m_overrideText = chr;
m_isStringMode = false;
tkn = Token::TokenQuotedText;
break;
} else {
buf += chr;
}
} else if (m_isTextMode) {
if (isalnum(chr) || (chr == '_')) {
buf += chr;
} else {
m_fileStream.putback(chr);
m_isTextMode = false;
break;
}
} else if (m_isNumberMode) {
if (isdigit(chr)) {
buf += chr;
} else if (chr == '.') {
if (m_numberModeHasDecimal == false) {
m_numberModeHasDecimal = true;
tkn = Token::TokenDecimal;
buf += chr;
} else {
throw std::runtime_error("Unexpected '.' while parsing number.");
}
} else {
m_fileStream.putback(chr);
m_isNumberMode = false;
break;
}
}
}
return std::make_pair(tkn, buf);
}