From 66bdd5f9173428dbf8f6b9206bff32a3d9608db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vili=20Sinerv=C3=A4?= Date: Sat, 18 Jan 2025 18:58:14 +0200 Subject: [PATCH] Initial tokenizer implementation --- Cargo.lock | 45 +++++++++++ Cargo.toml | 1 + src/compiler.rs | 6 ++ src/compiler/token.rs | 59 ++++++++++++++ src/compiler/tokenizer.rs | 161 ++++++++++++++++++++++++++++++++++++++ src/main.rs | 1 + src/server.rs | 4 +- 7 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 src/compiler.rs create mode 100644 src/compiler/token.rs create mode 100644 src/compiler/tokenizer.rs diff --git a/Cargo.lock b/Cargo.lock index 98dcbec..f0b3f00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,11 +2,21 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "compiler-course" version = "0.1.0" dependencies = [ "json", + "regex", ] [[package]] @@ -14,3 +24,38 @@ name = "json" version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = 
[ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" diff --git a/Cargo.toml b/Cargo.toml index 551a6be..aa8658a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,3 +5,4 @@ edition = "2021" [dependencies] json = "0.12.4" +regex = "1.11.1" diff --git a/src/compiler.rs b/src/compiler.rs new file mode 100644 index 0000000..bc1e972 --- /dev/null +++ b/src/compiler.rs @@ -0,0 +1,6 @@ +mod token; +mod tokenizer; + +pub fn compile(code: &str) { + tokenizer::tokenize(code); +} diff --git a/src/compiler/token.rs b/src/compiler/token.rs new file mode 100644 index 0000000..53ef9c5 --- /dev/null +++ b/src/compiler/token.rs @@ -0,0 +1,59 @@ +#[derive(Debug, Copy, Clone)] +pub struct CodeLocation { + row: i32, + col: i32, +} + +impl CodeLocation { + pub fn new(row: i32, col: i32) -> Self { + Self { row, col } + } +} + +impl PartialEq for CodeLocation { + fn eq(&self, other: &Self) -> bool { + let true_match = self.row == other.row && self.col == other.col; + + // For testing purposes + let simulated_match = self.row < 0 || self.col < 0 || other.row < 0 || other.col < 0; + + true_match || simulated_match + } +} + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum TokenType { + Comment, + Integer, + Identifier, + Operator, + Punctuation, + Whitespace, +} + +impl TokenType { + pub fn ignore(&self) -> bool { + use TokenType::*; + match self { + Whitespace | Comment => true, + _ => false, + } + } +} + +#[derive(Debug, PartialEq)] +pub struct Token { + text: String, + token_type: TokenType, + loc: CodeLocation, +} + +impl Token { + pub fn new(text: &str, token_type: TokenType, loc: CodeLocation) -> Self { + Self { + text: text.to_string(), + token_type, + loc, + } + } +} diff --git a/src/compiler/tokenizer.rs b/src/compiler/tokenizer.rs new file mode 100644 index 0000000..19ca518 --- 
/dev/null +++ b/src/compiler/tokenizer.rs @@ -0,0 +1,161 @@ +use crate::compiler::token::{CodeLocation, Token, TokenType}; +use regex::Regex; + +pub fn tokenize(code: &str) -> Vec<Token> { + // We only want to compile the regexes once + // The ordering of these is important! + let regexes = vec![ + (TokenType::Comment, Regex::new(r"^(\\\\|#).*\n").unwrap()), + (TokenType::Whitespace, Regex::new(r"^[\s\t\n]+").unwrap()), + ( + TokenType::Operator, + Regex::new(r"^(==|!=|<=|>=|=|<|>|\+|-|\*|/)").unwrap(), + ), + (TokenType::Punctuation, Regex::new(r"^[\(\){},;]").unwrap()), + (TokenType::Integer, Regex::new(r"^[0-9]+").unwrap()), + ( + TokenType::Identifier, + Regex::new(r"^[[:alpha:]_][[:alpha:]0-9_]*").unwrap(), + ), + ]; + + let mut tokens = Vec::new(); + + let mut pos = 0; + + while pos < code.len() { + let mut valid_token = false; + + for (token_type, regex_matcher) in &regexes { + let found_match = regex_matcher.find(&code[pos..]); + + if let Some(token) = found_match { + if !token_type.ignore() { + tokens.push(Token::new( + &code[pos + token.start()..pos + token.end()], + *token_type, + CodeLocation::new(0, 0), + )); + } + + valid_token = true; + pos += token.end(); + } + } + + if !valid_token { + panic!("Invalid token at {pos}"); + } + } + + tokens +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize_basic() { + let loc = CodeLocation::new(-1, -1); + let result = tokenize("if 3 \n\twhile"); + + use TokenType::*; + assert_eq!( + result, + vec!( + Token::new("if", Identifier, loc), + Token::new("3", Integer, loc), + Token::new("while", Identifier, loc), + ) + ); + } + + #[test] + fn test_tokenize_comment() { + let loc = CodeLocation::new(-1, -1); + let result = tokenize("if 3 \n\n\\\\Comment\n#Another\n\twhile"); + + use TokenType::*; + assert_eq!( + result, + vec!( + Token::new("if", Identifier, loc), + Token::new("3", Integer, loc), + Token::new("while", Identifier, loc), + ) + ); + } + + #[test] + fn test_tokenize_operators_basic() { + let 
loc = CodeLocation::new(-1, -1); + let result = tokenize("var = 1 + 2"); + + use TokenType::*; + assert_eq!( + result, + vec!( + Token::new("var", Identifier, loc), + Token::new("=", Operator, loc), + Token::new("1", Integer, loc), + Token::new("+", Operator, loc), + Token::new("2", Integer, loc), + ) + ); + } + + #[test] + fn test_tokenize_operators_all() { + let loc = CodeLocation::new(-1, -1); + let result = tokenize("var 1 + - * 1/2 = == != < <= > >= 2"); + + use TokenType::*; + assert_eq!( + result, + vec!( + Token::new("var", Identifier, loc), + Token::new("1", Integer, loc), + Token::new("+", Operator, loc), + Token::new("-", Operator, loc), + Token::new("*", Operator, loc), + Token::new("1", Integer, loc), + Token::new("/", Operator, loc), + Token::new("2", Integer, loc), + Token::new("=", Operator, loc), + Token::new("==", Operator, loc), + Token::new("!=", Operator, loc), + Token::new("<", Operator, loc), + Token::new("<=", Operator, loc), + Token::new(">", Operator, loc), + Token::new(">=", Operator, loc), + Token::new("2", Integer, loc), + ) + ); + } + + #[test] + fn test_tokenize_punctuation_basic() { + let loc = CodeLocation::new(-1, -1); + let result = tokenize("{var = (1 + 2, 3);}"); + + use TokenType::*; + assert_eq!( + result, + vec!( + Token::new("{", Punctuation, loc), + Token::new("var", Identifier, loc), + Token::new("=", Operator, loc), + Token::new("(", Punctuation, loc), + Token::new("1", Integer, loc), + Token::new("+", Operator, loc), + Token::new("2", Integer, loc), + Token::new(",", Punctuation, loc), + Token::new("3", Integer, loc), + Token::new(")", Punctuation, loc), + Token::new(";", Punctuation, loc), + Token::new("}", Punctuation, loc), + ) + ); + } +} diff --git a/src/main.rs b/src/main.rs index bdc4a37..723e96d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +mod compiler; mod server; fn main() { diff --git a/src/server.rs b/src/server.rs index f731a92..77decf0 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1,3 
+1,4 @@ +use crate::compiler; use json; use std::{ io::prelude::*, @@ -34,7 +35,8 @@ fn handle_connection(mut stream: TcpStream) { match json_request["command"].as_str().unwrap() { "ping" => println!("ping"), "compile" => { - let program = &json_request["code"].as_str().unwrap(); + let program = json_request["code"].as_str().unwrap(); + compiler::compile(program); println!("compile code:\n\n{program}\n"); } _ => panic!("Unexpected command!"),