1
0
Fork 0

Initial tokenizer implementation

This commit is contained in:
Vili Sinervä 2025-01-18 18:58:14 +02:00
parent f6ac3e60a9
commit 66bdd5f917
No known key found for this signature in database
GPG key ID: DF8FEAF54EFAC996
7 changed files with 276 additions and 1 deletion

45
Cargo.lock generated
View file

@ -2,11 +2,21 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "compiler-course"
version = "0.1.0"
dependencies = [
"json",
"regex",
]
[[package]]
@ -14,3 +24,38 @@ name = "json"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"

View file

@ -5,3 +5,4 @@ edition = "2021"
[dependencies]
json = "0.12.4"
regex = "1.11.1"

6
src/compiler.rs Normal file
View file

@ -0,0 +1,6 @@
mod token;
mod tokenizer;

/// Entry point of the compilation pipeline. Currently it only runs the
/// tokenizer over `code`; the resulting token stream is discarded until
/// later compiler stages exist.
pub fn compile(code: &str) {
    let _tokens = tokenizer::tokenize(code);
}

59
src/compiler/token.rs Normal file
View file

@ -0,0 +1,59 @@
/// A (row, column) position in the source text.
#[derive(Debug, Copy, Clone)]
pub struct CodeLocation {
    row: i32,
    col: i32,
}

impl CodeLocation {
    /// Build a location from a row and a column index.
    pub fn new(row: i32, col: i32) -> Self {
        Self { row, col }
    }
}

impl PartialEq for CodeLocation {
    /// Two locations are equal when both coordinates agree exactly.
    /// As a testing convenience, any location with a negative row or
    /// column acts as a wildcard and compares equal to everything.
    fn eq(&self, other: &Self) -> bool {
        let wildcard = [self.row, self.col, other.row, other.col]
            .iter()
            .any(|&v| v < 0);
        wildcard || (self.row, self.col) == (other.row, other.col)
    }
}
/// Classification of a lexeme produced by the tokenizer.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum TokenType {
    Comment,
    Integer,
    Identifier,
    Operator,
    Punctuation,
    Whitespace,
}

impl TokenType {
    /// Whether tokens of this type are dropped from the token stream.
    /// Comments and whitespace carry no semantic content.
    pub fn ignore(&self) -> bool {
        matches!(self, TokenType::Whitespace | TokenType::Comment)
    }
}
/// A single lexeme: the matched text, its classification, and where in
/// the source it occurred.
#[derive(Debug, PartialEq)]
pub struct Token {
    text: String,
    token_type: TokenType,
    loc: CodeLocation,
}

impl Token {
    /// Build a token, taking an owned copy of the matched text.
    pub fn new(text: &str, token_type: TokenType, loc: CodeLocation) -> Self {
        Self {
            text: String::from(text),
            token_type,
            loc,
        }
    }
}

161
src/compiler/tokenizer.rs Normal file
View file

@ -0,0 +1,161 @@
use crate::compiler::token::{CodeLocation, Token, TokenType};
use regex::Regex;
/// Split `code` into a vector of semantic tokens (comments and
/// whitespace are matched but dropped).
///
/// # Panics
/// Panics with the byte offset when no pattern matches at the current
/// position.
pub fn tokenize(code: &str) -> Vec<Token> {
    // Compiled once per call, hoisted out of the scanning loop.
    // The ordering of these is important! Earlier entries win, e.g.
    // comments must be tried before operators, `==` before `=`.
    let regexes = vec![
        // `(\n|$)` terminates a comment at end of input as well as at a
        // newline; requiring `\n` made a comment on the last line of the
        // input unmatchable, so tokenize panicked on it.
        (TokenType::Comment, Regex::new(r"^(\\\\|#).*(\n|$)").unwrap()),
        (TokenType::Whitespace, Regex::new(r"^[\s\t\n]+").unwrap()),
        (
            TokenType::Operator,
            Regex::new(r"^(==|!=|<=|>=|=|<|>|\+|-|\*|/)").unwrap(),
        ),
        (TokenType::Punctuation, Regex::new(r"^[\(\){},;]").unwrap()),
        (TokenType::Integer, Regex::new(r"^[0-9]+").unwrap()),
        (
            TokenType::Identifier,
            Regex::new(r"^[[:alpha:]_][[:alpha:]0-9_]*").unwrap(),
        ),
    ];

    let mut tokens = Vec::new();
    let mut pos = 0;

    while pos < code.len() {
        let mut valid_token = false;

        for (token_type, regex_matcher) in &regexes {
            // All patterns are anchored with `^`, so a match always
            // starts at `pos`.
            if let Some(token) = regex_matcher.find(&code[pos..]) {
                if !token_type.ignore() {
                    tokens.push(Token::new(
                        token.as_str(),
                        *token_type,
                        // TODO: track the real row/column instead of a
                        // placeholder location.
                        CodeLocation::new(0, 0),
                    ));
                }
                valid_token = true;
                pos += token.end();
                // Restart from the first (highest-priority) pattern at the
                // new position. Without this `break`, the remaining
                // patterns were tried against the text *after* the token
                // just consumed, bypassing the priority order.
                break;
            }
        }

        if !valid_token {
            panic!("Invalid token at {pos}");
        }
    }

    tokens
}
#[cfg(test)]
mod tests {
    use super::*;
    use TokenType::*;

    /// Wildcard location: negative coordinates compare equal to any
    /// real location, so expected tokens need no position bookkeeping.
    fn any_loc() -> CodeLocation {
        CodeLocation::new(-1, -1)
    }

    #[test]
    fn test_tokenize_basic() {
        let tokens = tokenize("if 3 \n\twhile");
        let expected = vec![
            Token::new("if", Identifier, any_loc()),
            Token::new("3", Integer, any_loc()),
            Token::new("while", Identifier, any_loc()),
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_tokenize_comment() {
        // Both comment styles must be skipped entirely.
        let tokens = tokenize("if 3 \n\n\\\\Comment\n#Another\n\twhile");
        let expected = vec![
            Token::new("if", Identifier, any_loc()),
            Token::new("3", Integer, any_loc()),
            Token::new("while", Identifier, any_loc()),
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_tokenize_operators_basic() {
        let tokens = tokenize("var = 1 + 2");
        let expected = vec![
            Token::new("var", Identifier, any_loc()),
            Token::new("=", Operator, any_loc()),
            Token::new("1", Integer, any_loc()),
            Token::new("+", Operator, any_loc()),
            Token::new("2", Integer, any_loc()),
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_tokenize_operators_all() {
        // Exercises every operator, including the two-character ones
        // that must win over their one-character prefixes.
        let tokens = tokenize("var 1 + - * 1/2 = == != < <= > >= 2");
        let expected = vec![
            Token::new("var", Identifier, any_loc()),
            Token::new("1", Integer, any_loc()),
            Token::new("+", Operator, any_loc()),
            Token::new("-", Operator, any_loc()),
            Token::new("*", Operator, any_loc()),
            Token::new("1", Integer, any_loc()),
            Token::new("/", Operator, any_loc()),
            Token::new("2", Integer, any_loc()),
            Token::new("=", Operator, any_loc()),
            Token::new("==", Operator, any_loc()),
            Token::new("!=", Operator, any_loc()),
            Token::new("<", Operator, any_loc()),
            Token::new("<=", Operator, any_loc()),
            Token::new(">", Operator, any_loc()),
            Token::new(">=", Operator, any_loc()),
            Token::new("2", Integer, any_loc()),
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_tokenize_punctuation_basic() {
        let tokens = tokenize("{var = (1 + 2, 3);}");
        let expected = vec![
            Token::new("{", Punctuation, any_loc()),
            Token::new("var", Identifier, any_loc()),
            Token::new("=", Operator, any_loc()),
            Token::new("(", Punctuation, any_loc()),
            Token::new("1", Integer, any_loc()),
            Token::new("+", Operator, any_loc()),
            Token::new("2", Integer, any_loc()),
            Token::new(",", Punctuation, any_loc()),
            Token::new("3", Integer, any_loc()),
            Token::new(")", Punctuation, any_loc()),
            Token::new(";", Punctuation, any_loc()),
            Token::new("}", Punctuation, any_loc()),
        ];
        assert_eq!(tokens, expected);
    }
}

View file

@ -1,3 +1,4 @@
mod compiler;
mod server;
fn main() {

View file

@ -1,3 +1,4 @@
use crate::compiler;
use json;
use std::{
io::prelude::*,
@ -34,7 +35,8 @@ fn handle_connection(mut stream: TcpStream) {
match json_request["command"].as_str().unwrap() {
"ping" => println!("ping"),
"compile" => {
let program = &json_request["code"].as_str().unwrap();
let program = json_request["code"].as_str().unwrap();
compiler::compile(program);
println!("compile code:\n\n{program}\n");
}
_ => panic!("Unexpected command!"),