1
0
Fork 0

Initial tokenizer implementation

This commit is contained in:
Vili Sinervä 2025-01-18 18:58:14 +02:00
parent f6ac3e60a9
commit 66bdd5f917
No known key found for this signature in database
GPG key ID: DF8FEAF54EFAC996
7 changed files with 276 additions and 1 deletions

59
src/compiler/token.rs Normal file
View file

@ -0,0 +1,59 @@
/// A `(row, col)` pair that is attached to each token as its location.
///
/// Equality is implemented by hand (see the `PartialEq` impl below): a
/// location with a negative coordinate compares equal to every location.
#[derive(Debug, Copy, Clone)]
pub struct CodeLocation {
    row: i32,
    col: i32,
}

impl CodeLocation {
    /// Creates a location from the given `row` and `col`.
    pub fn new(row: i32, col: i32) -> Self {
        CodeLocation { row, col }
    }

    /// Returns `true` when either coordinate is negative.
    fn has_negative_coordinate(&self) -> bool {
        self.row < 0 || self.col < 0
    }
}

impl PartialEq for CodeLocation {
    /// Compares two locations.
    ///
    /// Returns `true` when both coordinates are identical, or when either
    /// side carries a negative coordinate.
    fn eq(&self, other: &Self) -> bool {
        // For testing purposes: a negative coordinate on either side acts as
        // a wildcard, so expected tokens can be written without knowing
        // their exact position.
        if self.has_negative_coordinate() || other.has_negative_coordinate() {
            return true;
        }
        (self.row, self.col) == (other.row, other.col)
    }
}
/// The category assigned to a token.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum TokenType {
    Comment,
    Integer,
    Identifier,
    Operator,
    Punctuation,
    Whitespace,
}

impl TokenType {
    /// Returns `true` for `Whitespace` and `Comment`, and `false` for every
    /// other variant.
    pub fn ignore(&self) -> bool {
        matches!(self, TokenType::Whitespace | TokenType::Comment)
    }
}
/// A single token: the matched text, its [`TokenType`], and the
/// [`CodeLocation`] it was given.
///
/// Equality is derived, so two tokens are equal when all three fields are
/// equal according to each field's own `PartialEq` implementation.
#[derive(Debug, PartialEq)]
pub struct Token {
    text: String,
    token_type: TokenType,
    loc: CodeLocation,
}

impl Token {
    /// Builds a token, copying `text` into an owned `String`.
    pub fn new(text: &str, token_type: TokenType, loc: CodeLocation) -> Self {
        let text = String::from(text);
        Token {
            text,
            token_type,
            loc,
        }
    }
}

161
src/compiler/tokenizer.rs Normal file
View file

@ -0,0 +1,161 @@
use crate::compiler::token::{CodeLocation, Token, TokenType};
use regex::Regex;
/// Splits `code` into a list of [`Token`]s.
///
/// The input is scanned left to right. At each position the patterns are
/// tried in a fixed priority order and the first one that matches consumes
/// its text. Tokens whose type reports [`TokenType::ignore`] (whitespace and
/// comments) are consumed but not returned.
///
/// Every returned token currently carries the placeholder location
/// `CodeLocation::new(0, 0)`; row/column tracking is not implemented here.
///
/// # Panics
///
/// Panics with `Invalid token at {pos}` (where `pos` is a byte offset into
/// `code`) when no pattern matches at the current position.
pub fn tokenize(code: &str) -> Vec<Token> {
    // We only want to compile the regexes once per call, outside the scan loop.
    // The ordering of these is important: the first pattern that matches wins.
    let regexes = [
        // A comment starts with two backslashes or `#` and runs to the end of
        // the line. `.` never matches `\n`, so the newline itself is left for
        // the whitespace pattern. The newline is deliberately NOT required
        // here, so that a comment on the last line of input (without a
        // trailing newline) is still accepted instead of panicking.
        // NOTE(review): the `\\` prefix may have been meant to be `//` - confirm.
        (TokenType::Comment, Regex::new(r"^(\\\\|#).*").unwrap()),
        (TokenType::Whitespace, Regex::new(r"^[\s\t\n]+").unwrap()),
        (
            TokenType::Operator,
            // Two-character operators come first so `==` is not split into `=` `=`.
            Regex::new(r"^(==|!=|<=|>=|=|<|>|\+|-|\*|/)").unwrap(),
        ),
        (TokenType::Punctuation, Regex::new(r"^[\(\){},;]").unwrap()),
        (TokenType::Integer, Regex::new(r"^[0-9]+").unwrap()),
        (
            TokenType::Identifier,
            Regex::new(r"^[[:alpha:]_][[:alpha:]0-9_]*").unwrap(),
        ),
    ];

    let mut tokens = Vec::new();
    let mut pos = 0;
    while pos < code.len() {
        let rest = &code[pos..];

        // Stop at the first pattern that matches so the priority order above
        // is what actually decides the token type.
        let found = regexes.iter().find_map(|(token_type, regex_matcher)| {
            regex_matcher.find(rest).map(|m| (*token_type, m))
        });

        match found {
            Some((token_type, m)) => {
                if !token_type.ignore() {
                    tokens.push(Token::new(
                        m.as_str(),
                        token_type,
                        CodeLocation::new(0, 0),
                    ));
                }
                // Every pattern is anchored with `^` and matches at least one
                // character, so this always makes progress.
                pos += m.end();
            }
            None => panic!("Invalid token at {pos}"),
        }
    }
    tokens
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an expected token whose location has negative coordinates, so
    /// that it compares equal to a token at any location.
    fn tok(text: &str, token_type: TokenType) -> Token {
        Token::new(text, token_type, CodeLocation::new(-1, -1))
    }

    /// Whitespace (spaces, newlines, tabs) is skipped between tokens.
    #[test]
    fn test_tokenize_basic() {
        use TokenType::*;
        let expected = vec![
            tok("if", Identifier),
            tok("3", Integer),
            tok("while", Identifier),
        ];
        assert_eq!(tokenize("if 3 \n\twhile"), expected);
    }

    /// Both comment styles are dropped from the output.
    #[test]
    fn test_tokenize_comment() {
        use TokenType::*;
        let expected = vec![
            tok("if", Identifier),
            tok("3", Integer),
            tok("while", Identifier),
        ];
        assert_eq!(
            tokenize("if 3 \n\n\\\\Comment\n#Another\n\twhile"),
            expected
        );
    }

    /// A simple assignment with one arithmetic operator.
    #[test]
    fn test_tokenize_operators_basic() {
        use TokenType::*;
        let expected = vec![
            tok("var", Identifier),
            tok("=", Operator),
            tok("1", Integer),
            tok("+", Operator),
            tok("2", Integer),
        ];
        assert_eq!(tokenize("var = 1 + 2"), expected);
    }

    /// Every supported operator, including the two-character ones.
    #[test]
    fn test_tokenize_operators_all() {
        use TokenType::*;
        let expected = vec![
            tok("var", Identifier),
            tok("1", Integer),
            tok("+", Operator),
            tok("-", Operator),
            tok("*", Operator),
            tok("1", Integer),
            tok("/", Operator),
            tok("2", Integer),
            tok("=", Operator),
            tok("==", Operator),
            tok("!=", Operator),
            tok("<", Operator),
            tok("<=", Operator),
            tok(">", Operator),
            tok(">=", Operator),
            tok("2", Integer),
        ];
        assert_eq!(tokenize("var 1 + - * 1/2 = == != < <= > >= 2"), expected);
    }

    /// Braces, parentheses, commas and semicolons are punctuation tokens.
    #[test]
    fn test_tokenize_punctuation_basic() {
        use TokenType::*;
        let expected = vec![
            tok("{", Punctuation),
            tok("var", Identifier),
            tok("=", Operator),
            tok("(", Punctuation),
            tok("1", Integer),
            tok("+", Operator),
            tok("2", Integer),
            tok(",", Punctuation),
            tok("3", Integer),
            tok(")", Punctuation),
            tok(";", Punctuation),
            tok("}", Punctuation),
        ];
        assert_eq!(tokenize("{var = (1 + 2, 3);}"), expected);
    }
}