Initial tokenizer implementation
This commit is contained in:
parent
f6ac3e60a9
commit
66bdd5f917
7 changed files with 276 additions and 1 deletions
59
src/compiler/token.rs
Normal file
59
src/compiler/token.rs
Normal file
|
@ -0,0 +1,59 @@
|
|||
/// A (row, col) position in source code.
///
/// Negative coordinates act as a wildcard in equality checks, which lets
/// tests compare tokens without pinning down exact positions.
#[derive(Debug, Copy, Clone)]
pub struct CodeLocation {
    row: i32,
    col: i32,
}

impl CodeLocation {
    /// Creates a location at the given row and column.
    pub fn new(row: i32, col: i32) -> Self {
        Self { row, col }
    }
}

impl PartialEq for CodeLocation {
    fn eq(&self, other: &Self) -> bool {
        // A negative coordinate on either side turns the comparison into a
        // wildcard match (used by the tokenizer tests).
        let wildcard = [self.row, self.col, other.row, other.col]
            .iter()
            .any(|&v| v < 0);

        wildcard || (self.row, self.col) == (other.row, other.col)
    }
}
|
||||
|
||||
/// The lexical category of a token.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum TokenType {
    Comment,
    Integer,
    Identifier,
    Operator,
    Punctuation,
    Whitespace,
}

impl TokenType {
    /// Returns `true` for categories that carry no meaning for later
    /// compilation stages (whitespace and comments) and are therefore
    /// dropped during tokenization.
    pub fn ignore(&self) -> bool {
        matches!(self, TokenType::Whitespace | TokenType::Comment)
    }
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct Token {
|
||||
text: String,
|
||||
token_type: TokenType,
|
||||
loc: CodeLocation,
|
||||
}
|
||||
|
||||
impl Token {
|
||||
pub fn new(text: &str, token_type: TokenType, loc: CodeLocation) -> Self {
|
||||
Self {
|
||||
text: text.to_string(),
|
||||
token_type,
|
||||
loc,
|
||||
}
|
||||
}
|
||||
}
|
161
src/compiler/tokenizer.rs
Normal file
161
src/compiler/tokenizer.rs
Normal file
|
@ -0,0 +1,161 @@
|
|||
use crate::compiler::token::{CodeLocation, Token, TokenType};
|
||||
use regex::Regex;
|
||||
|
||||
pub fn tokenize(code: &str) -> Vec<Token> {
|
||||
// We only want to compile the regexes once
|
||||
// The ordering of these is important!
|
||||
let regexes = vec![
|
||||
(TokenType::Comment, Regex::new(r"^(\\\\|#).*\n").unwrap()),
|
||||
(TokenType::Whitespace, Regex::new(r"^[\s\t\n]+").unwrap()),
|
||||
(
|
||||
TokenType::Operator,
|
||||
Regex::new(r"^(==|!=|<=|>=|=|<|>|\+|-|\*|/)").unwrap(),
|
||||
),
|
||||
(TokenType::Punctuation, Regex::new(r"^[\(\){},;]").unwrap()),
|
||||
(TokenType::Integer, Regex::new(r"^[0-9]+").unwrap()),
|
||||
(
|
||||
TokenType::Identifier,
|
||||
Regex::new(r"^[[:alpha:]_][[:alpha:]0-9_]*").unwrap(),
|
||||
),
|
||||
];
|
||||
|
||||
let mut tokens = Vec::new();
|
||||
|
||||
let mut pos = 0;
|
||||
|
||||
while pos < code.len() {
|
||||
let mut valid_token = false;
|
||||
|
||||
for (token_type, regex_matcher) in ®exes {
|
||||
let found_match = regex_matcher.find(&code[pos..]);
|
||||
|
||||
if let Some(token) = found_match {
|
||||
if !token_type.ignore() {
|
||||
tokens.push(Token::new(
|
||||
&code[pos + token.start()..pos + token.end()],
|
||||
*token_type,
|
||||
CodeLocation::new(0, 0),
|
||||
));
|
||||
}
|
||||
|
||||
valid_token = true;
|
||||
pos += token.end();
|
||||
}
|
||||
}
|
||||
|
||||
if !valid_token {
|
||||
panic!("Invalid token at {pos}");
|
||||
}
|
||||
}
|
||||
|
||||
tokens
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use TokenType::*;

    /// Shorthand for an expected token at a wildcard location — negative
    /// coordinates compare equal to any position, so these tests ignore
    /// location tracking.
    fn tok(text: &str, token_type: TokenType) -> Token {
        Token::new(text, token_type, CodeLocation::new(-1, -1))
    }

    #[test]
    fn test_tokenize_basic() {
        let result = tokenize("if 3 \n\twhile");

        let expected = vec![
            tok("if", Identifier),
            tok("3", Integer),
            tok("while", Identifier),
        ];
        assert_eq!(result, expected);
    }

    #[test]
    fn test_tokenize_comment() {
        // Both `\\`- and `#`-style comments are dropped from the output.
        let result = tokenize("if 3 \n\n\\\\Comment\n#Another\n\twhile");

        let expected = vec![
            tok("if", Identifier),
            tok("3", Integer),
            tok("while", Identifier),
        ];
        assert_eq!(result, expected);
    }

    #[test]
    fn test_tokenize_operators_basic() {
        let result = tokenize("var = 1 + 2");

        let expected = vec![
            tok("var", Identifier),
            tok("=", Operator),
            tok("1", Integer),
            tok("+", Operator),
            tok("2", Integer),
        ];
        assert_eq!(result, expected);
    }

    #[test]
    fn test_tokenize_operators_all() {
        let result = tokenize("var 1 + - * 1/2 = == != < <= > >= 2");

        let expected = vec![
            tok("var", Identifier),
            tok("1", Integer),
            tok("+", Operator),
            tok("-", Operator),
            tok("*", Operator),
            tok("1", Integer),
            tok("/", Operator),
            tok("2", Integer),
            tok("=", Operator),
            tok("==", Operator),
            tok("!=", Operator),
            tok("<", Operator),
            tok("<=", Operator),
            tok(">", Operator),
            tok(">=", Operator),
            tok("2", Integer),
        ];
        assert_eq!(result, expected);
    }

    #[test]
    fn test_tokenize_punctuation_basic() {
        let result = tokenize("{var = (1 + 2, 3);}");

        let expected = vec![
            tok("{", Punctuation),
            tok("var", Identifier),
            tok("=", Operator),
            tok("(", Punctuation),
            tok("1", Integer),
            tok("+", Operator),
            tok("2", Integer),
            tok(",", Punctuation),
            tok("3", Integer),
            tok(")", Punctuation),
            tok(";", Punctuation),
            tok("}", Punctuation),
        ];
        assert_eq!(result, expected);
    }
}
|
Reference in a new issue