1
0
Fork 0

Add proper line/char location for tokens

This commit is contained in:
Vili Sinervä 2025-01-21 18:59:20 +02:00
parent c9ef000cd0
commit 28a8ae69be
No known key found for this signature in database
GPG key ID: DF8FEAF54EFAC996
2 changed files with 36 additions and 30 deletions

View file

@ -1,24 +1,24 @@
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
pub struct CodeLocation { pub struct CodeLocation {
start: usize, line: usize,
end: usize, char: usize,
} }
impl CodeLocation { impl CodeLocation {
pub fn new(start: usize, end: usize) -> Self { pub fn new(line: usize, char: usize) -> Self {
Self { start, end } Self { line, char }
} }
} }
impl PartialEq for CodeLocation { impl PartialEq for CodeLocation {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
let true_match = self.start == other.start && self.end == other.end; let true_match = self.line == other.line && self.char == other.char;
// For testing purposes // For testing purposes
let simulated_match = self.start == usize::MAX let simulated_match = self.line == usize::MAX
|| self.end == usize::MAX || self.char == usize::MAX
|| other.start == usize::MAX || other.line == usize::MAX
|| other.end == usize::MAX; || other.char == usize::MAX;
true_match || simulated_match true_match || simulated_match
} }

View file

@ -5,7 +5,7 @@ pub fn tokenize(code: &str) -> Vec<Token> {
// We only want to compile the regexes once // We only want to compile the regexes once
// The ordering of these is important! // The ordering of these is important!
let regexes = vec![ let regexes = vec![
(TokenType::Comment, Regex::new(r"^(\\\\|#).*\n").unwrap()), (TokenType::Comment, Regex::new(r"^(//|#).*").unwrap()),
(TokenType::Whitespace, Regex::new(r"^[\s\t\n]+").unwrap()), (TokenType::Whitespace, Regex::new(r"^[\s\t\n]+").unwrap()),
( (
TokenType::Operator, TokenType::Operator,
@ -21,32 +21,38 @@ pub fn tokenize(code: &str) -> Vec<Token> {
let mut tokens = Vec::new(); let mut tokens = Vec::new();
let mut pos = 0; for (line_number, line) in code.lines().enumerate() {
let mut pos = 0;
while pos < code.len() { while pos < line.len() {
let mut valid_token = false; let mut valid_token = false;
for (token_type, regex_matcher) in &regexes { for (token_type, regex_matcher) in &regexes {
let found_match = regex_matcher.find(&code[pos..]); let found_match = regex_matcher.find(&line[pos..]);
if let Some(token) = found_match { if let Some(token) = found_match {
if !token_type.ignore() { if !token_type.ignore() {
let start = pos + token.start(); let start = pos + token.start();
let end = pos + token.end(); let end = pos + token.end();
tokens.push(Token::new( tokens.push(Token::new(
&code[start..end], &line[start..end],
*token_type, *token_type,
CodeLocation::new(start, end), CodeLocation::new(line_number + 1, start + 1), // 1-indexing
)); ));
}
valid_token = true;
pos += token.end();
} }
valid_token = true;
pos += token.end();
} }
}
if !valid_token { if !valid_token {
panic!("Invalid token at {pos}"); panic!(
"Invalid token on line {} in position {}",
line_number + 1,
pos + 1
);
}
} }
} }