Initial tokenizer implementation
This commit is contained in:
parent
f6ac3e60a9
commit
66bdd5f917
7 changed files with 276 additions and 1 deletions
45
Cargo.lock
generated
45
Cargo.lock
generated
|
@ -2,11 +2,21 @@
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 3
|
version = 3
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "compiler-course"
|
name = "compiler-course"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"json",
|
"json",
|
||||||
|
"regex",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -14,3 +24,38 @@ name = "json"
|
||||||
version = "0.12.4"
|
version = "0.12.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
|
checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.7.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||||
|
|
|
@ -5,3 +5,4 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
json = "0.12.4"
|
json = "0.12.4"
|
||||||
|
regex = "1.11.1"
|
||||||
|
|
6
src/compiler.rs
Normal file
6
src/compiler.rs
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
mod token;
|
||||||
|
mod tokenizer;
|
||||||
|
|
||||||
|
pub fn compile(code: &str) {
|
||||||
|
tokenizer::tokenize(code);
|
||||||
|
}
|
59
src/compiler/token.rs
Normal file
59
src/compiler/token.rs
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
/// A (row, column) position inside the source text.
///
/// Note: equality is customized elsewhere in this file so that any
/// negative coordinate acts as a wildcard, which the tokenizer tests
/// rely on.
#[derive(Debug, Copy, Clone)]
pub struct CodeLocation {
    row: i32,
    col: i32,
}

impl CodeLocation {
    /// Creates a location at the given `row` and `col`.
    pub fn new(row: i32, col: i32) -> Self {
        CodeLocation { row, col }
    }
}
|
||||||
|
|
||||||
|
impl PartialEq for CodeLocation {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
let true_match = self.row == other.row && self.col == other.col;
|
||||||
|
|
||||||
|
// For testing purposes
|
||||||
|
let simulated_match = self.row < 0 || self.col < 0 || other.row < 0 || other.col < 0;
|
||||||
|
|
||||||
|
true_match || simulated_match
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lexical category of a token.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum TokenType {
    Comment,
    Integer,
    Identifier,
    Operator,
    Punctuation,
    Whitespace,
}

impl TokenType {
    /// Returns `true` for categories the parser never needs to see
    /// (whitespace and comments are consumed but not emitted).
    pub fn ignore(&self) -> bool {
        matches!(self, TokenType::Whitespace | TokenType::Comment)
    }
}
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq)]
|
||||||
|
pub struct Token {
|
||||||
|
text: String,
|
||||||
|
token_type: TokenType,
|
||||||
|
loc: CodeLocation,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Token {
|
||||||
|
pub fn new(text: &str, token_type: TokenType, loc: CodeLocation) -> Self {
|
||||||
|
Self {
|
||||||
|
text: text.to_string(),
|
||||||
|
token_type,
|
||||||
|
loc,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
161
src/compiler/tokenizer.rs
Normal file
161
src/compiler/tokenizer.rs
Normal file
|
@ -0,0 +1,161 @@
|
||||||
|
use crate::compiler::token::{CodeLocation, Token, TokenType};
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
pub fn tokenize(code: &str) -> Vec<Token> {
|
||||||
|
// We only want to compile the regexes once
|
||||||
|
// The ordering of these is important!
|
||||||
|
let regexes = vec![
|
||||||
|
(TokenType::Comment, Regex::new(r"^(\\\\|#).*\n").unwrap()),
|
||||||
|
(TokenType::Whitespace, Regex::new(r"^[\s\t\n]+").unwrap()),
|
||||||
|
(
|
||||||
|
TokenType::Operator,
|
||||||
|
Regex::new(r"^(==|!=|<=|>=|=|<|>|\+|-|\*|/)").unwrap(),
|
||||||
|
),
|
||||||
|
(TokenType::Punctuation, Regex::new(r"^[\(\){},;]").unwrap()),
|
||||||
|
(TokenType::Integer, Regex::new(r"^[0-9]+").unwrap()),
|
||||||
|
(
|
||||||
|
TokenType::Identifier,
|
||||||
|
Regex::new(r"^[[:alpha:]_][[:alpha:]0-9_]*").unwrap(),
|
||||||
|
),
|
||||||
|
];
|
||||||
|
|
||||||
|
let mut tokens = Vec::new();
|
||||||
|
|
||||||
|
let mut pos = 0;
|
||||||
|
|
||||||
|
while pos < code.len() {
|
||||||
|
let mut valid_token = false;
|
||||||
|
|
||||||
|
for (token_type, regex_matcher) in ®exes {
|
||||||
|
let found_match = regex_matcher.find(&code[pos..]);
|
||||||
|
|
||||||
|
if let Some(token) = found_match {
|
||||||
|
if !token_type.ignore() {
|
||||||
|
tokens.push(Token::new(
|
||||||
|
&code[pos + token.start()..pos + token.end()],
|
||||||
|
*token_type,
|
||||||
|
CodeLocation::new(0, 0),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
valid_token = true;
|
||||||
|
pos += token.end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !valid_token {
|
||||||
|
panic!("Invalid token at {pos}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tokens
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // All tests compare against tokens built with `CodeLocation::new(-1, -1)`:
    // negative coordinates act as a wildcard in `CodeLocation::eq`, so the
    // assertions check text and token type without predicting exact positions.

    // Identifiers and integers are emitted; whitespace (spaces, newline, tab)
    // is consumed but not emitted.
    #[test]
    fn test_tokenize_basic() {
        let loc = CodeLocation::new(-1, -1);
        let result = tokenize("if 3 \n\twhile");

        use TokenType::*;
        assert_eq!(
            result,
            vec!(
                Token::new("if", Identifier, loc),
                Token::new("3", Integer, loc),
                Token::new("while", Identifier, loc),
            )
        );
    }

    // Both comment markers (`\\` — two literal backslashes — and `#`) are
    // consumed to end of line and never appear in the output.
    #[test]
    fn test_tokenize_comment() {
        let loc = CodeLocation::new(-1, -1);
        let result = tokenize("if 3 \n\n\\\\Comment\n#Another\n\twhile");

        use TokenType::*;
        assert_eq!(
            result,
            vec!(
                Token::new("if", Identifier, loc),
                Token::new("3", Integer, loc),
                Token::new("while", Identifier, loc),
            )
        );
    }

    // A simple assignment expression mixes identifiers, operators and integers.
    #[test]
    fn test_tokenize_operators_basic() {
        let loc = CodeLocation::new(-1, -1);
        let result = tokenize("var = 1 + 2");

        use TokenType::*;
        assert_eq!(
            result,
            vec!(
                Token::new("var", Identifier, loc),
                Token::new("=", Operator, loc),
                Token::new("1", Integer, loc),
                Token::new("+", Operator, loc),
                Token::new("2", Integer, loc),
            )
        );
    }

    // Every supported operator, including the two-character ones (`==`, `!=`,
    // `<=`, `>=`) which must not be split into single-character operators.
    // `1/2` also checks that `/` tokenizes without surrounding whitespace.
    #[test]
    fn test_tokenize_operators_all() {
        let loc = CodeLocation::new(-1, -1);
        let result = tokenize("var 1 + - * 1/2 = == != < <= > >= 2");

        use TokenType::*;
        assert_eq!(
            result,
            vec!(
                Token::new("var", Identifier, loc),
                Token::new("1", Integer, loc),
                Token::new("+", Operator, loc),
                Token::new("-", Operator, loc),
                Token::new("*", Operator, loc),
                Token::new("1", Integer, loc),
                Token::new("/", Operator, loc),
                Token::new("2", Integer, loc),
                Token::new("=", Operator, loc),
                Token::new("==", Operator, loc),
                Token::new("!=", Operator, loc),
                Token::new("<", Operator, loc),
                Token::new("<=", Operator, loc),
                Token::new(">", Operator, loc),
                Token::new(">=", Operator, loc),
                Token::new("2", Integer, loc),
            )
        );
    }

    // Braces, parentheses, comma and semicolon each come out as their own
    // Punctuation token, interleaved with the surrounding expression tokens.
    #[test]
    fn test_tokenize_punctuation_basic() {
        let loc = CodeLocation::new(-1, -1);
        let result = tokenize("{var = (1 + 2, 3);}");

        use TokenType::*;
        assert_eq!(
            result,
            vec!(
                Token::new("{", Punctuation, loc),
                Token::new("var", Identifier, loc),
                Token::new("=", Operator, loc),
                Token::new("(", Punctuation, loc),
                Token::new("1", Integer, loc),
                Token::new("+", Operator, loc),
                Token::new("2", Integer, loc),
                Token::new(",", Punctuation, loc),
                Token::new("3", Integer, loc),
                Token::new(")", Punctuation, loc),
                Token::new(";", Punctuation, loc),
                Token::new("}", Punctuation, loc),
            )
        );
    }
}
|
|
@ -1,3 +1,4 @@
|
||||||
|
mod compiler;
|
||||||
mod server;
|
mod server;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
use crate::compiler;
|
||||||
use json;
|
use json;
|
||||||
use std::{
|
use std::{
|
||||||
io::prelude::*,
|
io::prelude::*,
|
||||||
|
@ -34,7 +35,8 @@ fn handle_connection(mut stream: TcpStream) {
|
||||||
match json_request["command"].as_str().unwrap() {
|
match json_request["command"].as_str().unwrap() {
|
||||||
"ping" => println!("ping"),
|
"ping" => println!("ping"),
|
||||||
"compile" => {
|
"compile" => {
|
||||||
let program = &json_request["code"].as_str().unwrap();
|
let program = json_request["code"].as_str().unwrap();
|
||||||
|
compiler::compile(program);
|
||||||
println!("compile code:\n\n{program}\n");
|
println!("compile code:\n\n{program}\n");
|
||||||
}
|
}
|
||||||
_ => panic!("Unexpected command!"),
|
_ => panic!("Unexpected command!"),
|
||||||
|
|
Reference in a new issue