diff --git a/Cargo.lock b/Cargo.lock index d4213ed..c840342 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,279 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "futures" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" + +[[package]] +name = "futures-executor" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" + +[[package]] +name = "futures-macro" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" + +[[package]] +name = "futures-task" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + +[[package]] +name = "futures-util" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "mal" version = "0.1.0" +dependencies = [ + "anyhow", + "rstest", +] + +[[package]] +name = "memchr" +version = "2.6.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "proc-macro2" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "relative-path" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c707298afce11da2efef2f600116fa93ffa7a032b5d7b628aa17711ec81383ca" + +[[package]] +name 
= "rstest" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97eeab2f3c0a199bc4be135c36c924b6590b88c377d416494288c14f2db30199" +dependencies = [ + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d428f8247852f894ee1be110b375111b586d4fa431f6c46e64ba5a0dcccbe605" +dependencies = [ + "cfg-if", + "glob", + "proc-macro2", + "quote", + "regex", + "relative-path", + "rustc_version", + "syn", + "unicode-ident", +] + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "semver" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "syn" +version = "2.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" diff --git a/Cargo.toml b/Cargo.toml index 647a9a7..04df988 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +anyhow 
= "1.0.75"
+
+[dev-dependencies]
+rstest = "0.18.2"
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..34cf0e8
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,261 @@
+//! Lexer: converts raw source text into a flat sequence of [`Token`]s.
+
+use std::{iter::Peekable, str::Chars};
+
+use anyhow::{bail, Result};
+
+/// A single lexical unit produced by [`read`].
+#[derive(Debug, PartialEq, PartialOrd)]
+pub enum Token {
+    LeftParen,
+    RightParen,
+
+    LeftBracket,
+    RightBracket,
+
+    LeftBrace,
+    RightBrace,
+
+    /// The two-character sequence `~@`.
+    WeirdSign,
+    Apostrophe,
+    Grave,
+    Tilde,
+    Carot,
+    AtSign,
+
+    // Math Operators
+    Plus,
+    Minus,
+    Asterisk,
+    Slash,
+
+    // Values
+    /// A `:`-prefixed name; the leading colon is not stored.
+    Keyword(String),
+    Int(i64),
+    /// The text between two `"` characters; the quotes are not stored.
+    String(String),
+    Ident(String),
+    True,
+    False,
+    Nil,
+}
+
+/// Tokenizes all of `input`.
+///
+/// # Errors
+///
+/// Fails on an unterminated string, an integer literal that does not fit in an
+/// `i64`, or a character that cannot start any token.
+pub fn read(input: &str) -> Result<Vec<Token>> {
+    let mut input = input.chars().peekable();
+    let mut tokens = Vec::new();
+
+    while let Some(tok) = next_token(&mut input)? {
+        tokens.push(tok);
+    }
+
+    Ok(tokens)
+}
+
+/// Returns the next token, or `Ok(None)` once the input is exhausted.
+///
+/// Whitespace and `;` comments are skipped by looping rather than recursing, so
+/// a long run of blank or commented-out input cannot overflow the stack.
+fn next_token(input: &mut Peekable<Chars>) -> Result<Option<Token>> {
+    loop {
+        let Some(c) = input.next() else {
+            return Ok(None);
+        };
+
+        let tok = match c {
+            // Weird sign
+            '~' if input.peek() == Some(&'@') => {
+                // Munch the @
+                input.next();
+                Token::WeirdSign
+            }
+
+            // Negative numbers
+            '-' if input.peek().is_some_and(|c| c.is_ascii_digit()) => read_int(input, '-')?,
+
+            // Munch comments: drop everything up to and including the newline
+            ';' => {
+                for c in input.by_ref() {
+                    if c == '\n' {
+                        break;
+                    }
+                }
+                continue;
+            }
+
+            '(' => Token::LeftParen,
+            ')' => Token::RightParen,
+
+            '[' => Token::LeftBracket,
+            ']' => Token::RightBracket,
+            '{' => Token::LeftBrace,
+            '}' => Token::RightBrace,
+
+            '\'' => Token::Apostrophe,
+            '`' => Token::Grave,
+            '~' => Token::Tilde,
+            '^' => Token::Carot,
+            '@' => Token::AtSign,
+
+            '+' => Token::Plus,
+            '-' => Token::Minus,
+            '*' => Token::Asterisk,
+            '/' => Token::Slash,
+
+            '"' => read_string(input)?,
+            ':' => read_keyword(input),
+
+            c if c.is_ascii_digit() => read_int(input, c)?,
+            c if c.is_ascii_alphabetic() => read_ident(input, c),
+
+            // Munch whitespace
+            c if c.is_whitespace() => continue,
+
+            c => bail!("illegal token {c:?}"),
+        };
+
+        return Ok(Some(tok));
+    }
+}
+
+/// Appends to `buf` every leading character of `input` accepted by `accept`,
+/// leaving the first rejected character unconsumed.
+fn read_while(
+    input: &mut Peekable<Chars>,
+    mut buf: String,
+    accept: impl Fn(&char) -> bool,
+) -> String {
+    while let Some(c) = input.next_if(&accept) {
+        buf.push(c);
+    }
+
+    buf
+}
+
+/// Reads the rest of a string literal; the opening `"` is already consumed.
+///
+/// No escape sequences are interpreted: the string ends at the next `"`.
+fn read_string(input: &mut Peekable<Chars>) -> Result<Token> {
+    let mut contents = String::new();
+
+    for c in input.by_ref() {
+        // The trailing " is consumed but not stored
+        if c == '"' {
+            return Ok(Token::String(contents));
+        }
+
+        contents.push(c);
+    }
+
+    bail!("unbalanced string")
+}
+
+/// Reads the name following a `:`; the colon itself is already consumed.
+fn read_keyword(input: &mut Peekable<Chars>) -> Token {
+    Token::Keyword(read_while(input, String::new(), char::is_ascii_alphanumeric))
+}
+
+/// Reads an integer literal whose first character (`-` or a digit) is `first`.
+///
+/// # Errors
+///
+/// Fails, rather than panicking, when the literal does not fit in an `i64`.
+fn read_int(input: &mut Peekable<Chars>, first: char) -> Result<Token> {
+    let digits = read_while(input, String::from(first), char::is_ascii_digit);
+
+    match digits.parse::<i64>() {
+        Ok(value) => Ok(Token::Int(value)),
+        Err(err) => bail!("invalid integer literal `{digits}`: {err}"),
+    }
+}
+
+/// Reads an identifier starting with `first`, mapping the reserved words
+/// `true`, `false` and `nil` to their dedicated tokens.
+fn read_ident(input: &mut Peekable<Chars>, first: char) -> Token {
+    let ident = read_while(input, String::from(first), char::is_ascii_alphanumeric);
+
+    match ident.as_str() {
+        "true" => Token::True,
+        "false" => Token::False,
+        "nil" => Token::Nil,
+        _ => Token::Ident(ident),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use rstest::rstest;
+
+    #[rstest]
+    #[case("()[]{}", vec![Token::LeftParen, Token::RightParen, Token::LeftBracket, Token::RightBracket, Token::LeftBrace, Token::RightBrace])]
+    #[case(" ' ` ^ ~@ ~ @", vec![Token::Apostrophe, Token::Grave, Token::Carot, Token::WeirdSign, Token::Tilde, Token::AtSign])]
+    #[case("(+ 1 2)", vec![Token::LeftParen, Token::Plus, Token::Int(1), Token::Int(2), Token::RightParen])]
+    #[case("(- 1 2)", vec![Token::LeftParen, Token::Minus, Token::Int(1), Token::Int(2), Token::RightParen])]
+    #[case("(* 1 2)", vec![Token::LeftParen, Token::Asterisk, Token::Int(1), Token::Int(2), Token::RightParen])]
+    #[case("(/ 1 2)", vec![Token::LeftParen, Token::Slash, Token::Int(1), Token::Int(2), Token::RightParen])]
+    #[case("(- -2 1)", vec![Token::LeftParen, Token::Minus, Token::Int(-2), Token::Int(1), Token::RightParen])]
+    #[case("(\"string and stuff\")", vec![Token::LeftParen, Token::String("string and stuff".into()), Token::RightParen])]
+    #[case(
+        "(func a b)",
+        vec![
+            Token::LeftParen,
+            Token::Ident("func".into()),
+            Token::Ident("a".into()),
+            Token::Ident("b".into()),
+            Token::RightParen
+        ]
+    )]
+    #[case(
+        "(+ 1 (- 2 1))",
+        vec![
+            Token::LeftParen,
+            Token::Plus,
+            Token::Int(1),
+            Token::LeftParen,
+            Token::Minus,
+            Token::Int(2),
+            Token::Int(1),
+            Token::RightParen,
+            Token::RightParen
+        ]
+    )]
+    #[case(
+        "(fn a ;; This comment is useless
+        (+ 1 2))",
+        vec![
+            Token::LeftParen,
+            Token::Ident("fn".into()),
+            Token::Ident("a".into()),
+            Token::LeftParen,
+            Token::Plus,
+            Token::Int(1),
+            Token::Int(2),
+            Token::RightParen,
+            Token::RightParen
+        ]
+    )]
+    fn test_lexer(#[case] input: &str, #[case] expected: Vec<Token>) {
+        let res = read(input).unwrap();
+        assert_eq!(res, expected);
+    }
+
+    #[rstest]
+    // Unbalanced string
+    #[case("(\"asdf)")]
+    // Integer literal too large for an i64
+    #[case("99999999999999999999")]
+    // Character that cannot start a token
+    #[case("#")]
+    fn test_lexer_errors(#[case] input: &str) {
+        let res = read(input);
+        assert!(res.is_err());
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 24a3688..fa028d6 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,8 @@
 use std::io::{self, Write};
 
+mod lexer;
+mod parser;
+
 fn main() {
     let mut input = String::new();
 
@@ -17,8 +20,11 @@ fn main() {
             break;
         }
 
-        let ast = read(&input);
-        let res = eval(&ast);
+        let res = match lexer::read(&input).and_then(parser::parse) {
+            Ok(ast) => eval(ast),
+            // Report bad input instead of panicking so the REPL keeps running
+            Err(err) => format!("error: {err}"),
+        };
 
         println!("{res}");
 
@@ -26,10 +32,7 @@ fn main() {
     }
 }
 
-fn read(input: &str) -> String {
-    input.to_owned()
-}
-
-fn eval(input: &str) -> String {
-    input.to_owned()
+/// Placeholder evaluator: renders the parsed AST in its `Debug` form.
+fn eval(input: Vec<parser::Node>) -> String {
+    format!("{input:?}")
 }
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..9461282
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,237 @@
+//! Parser: converts the lexer's [`Token`] stream into a tree of [`Node`]s.
+
+use std::{iter::Peekable, vec::IntoIter};
+
+use anyhow::{bail, Result};
+
+use crate::lexer::Token;
+
+/// A node of the syntax tree produced by [`parse`].
+#[derive(Debug, PartialEq, PartialOrd)]
+pub enum Node {
+    List(Vec<Node>),
+    Vector(Vec<Node>),
+    /// Elements in source order; they are not grouped into key/value pairs.
+    HashMap(Vec<Node>),
+
+    Symbol(String),
+    Keyword(String),
+    Int(i64),
+    String(String),
+    True,
+    False,
+    Nil,
+}
+
+/// Parses every top-level form in `tokens`.
+///
+/// # Errors
+///
+/// Fails on a closing delimiter without an opener, an unclosed collection, a
+/// quote prefix with nothing after it, or a `^` / `@` prefix (neither is
+/// implemented yet).
+pub fn parse(tokens: Vec<Token>) -> Result<Vec<Node>> {
+    let mut tokens = tokens.into_iter().peekable();
+    let mut ast = Vec::new();
+
+    while let Some(node) = next_statement(&mut tokens)? {
+        ast.push(node);
+    }
+
+    Ok(ast)
+}
+
+/// Parses one complete form, or returns `Ok(None)` when no tokens remain.
+fn next_statement(tokens: &mut Peekable<IntoIter<Token>>) -> Result<Option<Node>> {
+    let Some(tok) = tokens.next() else {
+        return Ok(None);
+    };
+
+    let node = match tok {
+        Token::LeftParen => read_list(tokens, Token::RightParen)?,
+        Token::RightParen => bail!("closing parenthesis does not have matching open parenthesis"),
+
+        Token::LeftBracket => read_list(tokens, Token::RightBracket)?,
+        Token::RightBracket => bail!("closing bracket does not have matching open bracket"),
+
+        Token::LeftBrace => read_list(tokens, Token::RightBrace)?,
+        Token::RightBrace => bail!("closing brace does not have matching open brace"),
+
+        Token::WeirdSign => read_quote(tokens, "splice-unquote")?,
+        Token::Apostrophe => read_quote(tokens, "quote")?,
+        Token::Grave => read_quote(tokens, "quasiquote")?,
+        Token::Tilde => read_quote(tokens, "unquote")?,
+
+        // TODO: meta; until then, report an error instead of panicking
+        Token::Carot => bail!("`^` (meta) is not supported yet"),
+        // TODO: deref; until then, report an error instead of panicking
+        Token::AtSign => bail!("`@` (deref) is not supported yet"),
+
+        Token::Plus => Node::Symbol("+".into()),
+        Token::Minus => Node::Symbol("-".into()),
+        Token::Asterisk => Node::Symbol("*".into()),
+        Token::Slash => Node::Symbol("/".into()),
+
+        Token::Keyword(val) => Node::Keyword(val),
+        Token::Ident(val) => Node::Symbol(val),
+        Token::String(val) => Node::String(val),
+        Token::Int(int) => Node::Int(int),
+        Token::True => Node::True,
+        Token::False => Node::False,
+        Token::Nil => Node::Nil,
+    };
+
+    Ok(Some(node))
+}
+
+/// Parses the elements of a collection up to and including `closer`; the
+/// opening delimiter is already consumed.
+///
+/// `closer` also selects the kind of node returned: `)` gives a list, `]` a
+/// vector and `}` a hashmap.
+fn read_list(tokens: &mut Peekable<IntoIter<Token>>, closer: Token) -> Result<Node> {
+    let mut list = Vec::new();
+
+    // Consume the closer as soon as it is next; otherwise parse one element
+    while tokens.next_if_eq(&closer).is_none() {
+        match next_statement(tokens)? {
+            Some(node) => list.push(node),
+            None => match closer {
+                Token::RightParen => bail!("unclosed list"),
+                Token::RightBracket => bail!("unclosed vector"),
+                Token::RightBrace => bail!("unclosed hashmap"),
+                _ => bail!("unreachable"),
+            },
+        }
+    }
+
+    match closer {
+        Token::RightParen => Ok(Node::List(list)),
+        Token::RightBracket => Ok(Node::Vector(list)),
+        Token::RightBrace => Ok(Node::HashMap(list)),
+
+        // This should theoretically be unreachable
+        _ => bail!(
+            "invalid collection type using closer {:?}. This is a bug; please file a bug report",
+            closer
+        ),
+    }
+}
+
+/// Wraps the next form as `(quote_type <form>)`; the quote token itself is
+/// already consumed.
+fn read_quote(tokens: &mut Peekable<IntoIter<Token>>, quote_type: &str) -> Result<Node> {
+    let Some(follower_node) = next_statement(tokens)? else {
+        bail!("quote does not have a valid follower node");
+    };
+
+    Ok(Node::List(vec![
+        Node::Symbol(quote_type.into()),
+        follower_node,
+    ]))
+}
+
+#[cfg(test)]
+mod test {
+    use crate::lexer;
+
+    use super::*;
+    use rstest::rstest;
+
+    #[rstest]
+    #[case("10", vec![
+        Node::Int(10)])]
+    #[case(":owo", vec![
+        Node::Keyword("owo".into())])]
+    #[case("\"uwu\"", vec![
+        Node::String("uwu".into())])]
+    #[case("(10 2)", vec![
+        Node::List(vec![
+            Node::Int(10),
+            Node::Int(2)])])]
+    #[case("[10 2]", vec![
+        Node::Vector(vec![
+            Node::Int(10),
+            Node::Int(2)])])]
+    #[case("{10 2}", vec![
+        Node::HashMap(vec![
+            Node::Int(10),
+            Node::Int(2)])])]
+    #[case("(+ - * /)", vec![
+        Node::List(vec![
+            Node::Symbol("+".into()),
+            Node::Symbol("-".into()),
+            Node::Symbol("*".into()),
+            Node::Symbol("/".into())])])]
+    #[case("'(1 2 3)", vec![
+        Node::List(vec![
+            Node::Symbol("quote".into()),
+            Node::List(vec![
+                Node::Int(1),
+                Node::Int(2),
+                Node::Int(3)])])])]
+    #[case("`(1 2 3)", vec![
+        Node::List(vec![
+            Node::Symbol("quasiquote".into()),
+            Node::List(vec![
+                Node::Int(1),
+                Node::Int(2),
+                Node::Int(3)])])])]
+    #[case("~(1 2 3)", vec![
+        Node::List(vec![
+            Node::Symbol("unquote".into()),
+            Node::List(vec![
+                Node::Int(1),
+                Node::Int(2),
+                Node::Int(3)])])])]
+    #[case("~@(1 2 3)", vec![
+        Node::List(vec![
+            Node::Symbol("splice-unquote".into()),
+            Node::List(vec![
+                Node::Int(1),
+                Node::Int(2),
+                Node::Int(3)])])])]
+    #[case("(+ 1 2)", vec![
+        Node::List(vec![
+            Node::Symbol("+".into()),
+            Node::Int(1),
+            Node::Int(2)])])]
+    #[case("(+ 1 2 (- 1 2))", vec![
+        Node::List(vec![
+            Node::Symbol("+".into()),
+            Node::Int(1),
+            Node::Int(2),
+            Node::List(vec![
+                Node::Symbol("-".into()),
+                Node::Int(1),
+                Node::Int(2)])])])]
+    fn test_parsing(#[case] input: &str, #[case] expected: Vec<Node>) {
+        let tokens = lexer::read(input).unwrap();
+        let res = parse(tokens).unwrap();
+        assert_eq!(res, expected);
+    }
+
+    #[rstest]
+    #[case(")")]
+    #[case("]")]
+    #[case("}")]
+    #[case("(1 2")]
+    #[case("[1 2")]
+    #[case("{1 2")]
+    #[case("(1 2 '")]
+    #[case("(1 2 ')")]
+    #[case("(1 2 ~")]
+    #[case("(1 2 ~)")]
+    #[case("(1 2 `")]
+    #[case("(1 2 `)")]
+    #[case("(1 2 ~@")]
+    #[case("(1 2 ~@)")]
+    // Reader macros that are not implemented yet must error, not panic
+    #[case("^a")]
+    #[case("@a")]
+    fn test_parsing_fail(#[case] input: &str) {
+        let tokens = lexer::read(input).unwrap();
+        let res = parse(tokens);
+        assert!(res.is_err());
+    }
+}