From becc4b4041aaf958da658c7635726a5b8ea1de1d Mon Sep 17 00:00:00 2001 From: Schrottkatze Date: Fri, 18 Oct 2024 14:05:27 +0200 Subject: [PATCH] json-pawarser: init --- Cargo.lock | 22 +++-- Cargo.toml | 4 +- crates/json-pawarser/Cargo.toml | 13 +++ crates/json-pawarser/src/grammar.rs | 67 +++++++++++++++ crates/json-pawarser/src/lib.rs | 3 + crates/json-pawarser/src/syntax_error.rs | 8 ++ crates/json-pawarser/src/syntax_kind.rs | 103 +++++++++++++++++++++++ crates/pawarser/src/lib.rs | 6 ++ crates/pawarser/src/parser.rs | 9 +- 9 files changed, 224 insertions(+), 11 deletions(-) create mode 100644 crates/json-pawarser/Cargo.toml create mode 100644 crates/json-pawarser/src/grammar.rs create mode 100644 crates/json-pawarser/src/lib.rs create mode 100644 crates/json-pawarser/src/syntax_error.rs create mode 100644 crates/json-pawarser/src/syntax_kind.rs diff --git a/Cargo.lock b/Cargo.lock index 4304154..a808278 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -804,6 +804,16 @@ dependencies = [ "rayon", ] +[[package]] +name = "json-pawarser" +version = "0.1.0" +dependencies = [ + "enumset", + "logos", + "pawarser", + "rowan", +] + [[package]] name = "lang" version = "0.1.0" @@ -881,18 +891,18 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "logos" -version = "0.14.0" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "161971eb88a0da7ae0c333e1063467c5b5727e7fb6b710b8db4814eade3a42e8" +checksum = "1c6b6e02facda28ca5fb8dbe4b152496ba3b1bd5a4b40bb2b1b2d8ad74e0f39b" dependencies = [ "logos-derive", ] [[package]] name = "logos-codegen" -version = "0.14.0" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e31badd9de5131fdf4921f6473d457e3dd85b11b7f091ceb50e4df7c3eeb12a" +checksum = "b32eb6b5f26efacd015b000bfc562186472cd9b34bdba3f6b264e2a052676d10" dependencies = [ "beef", "fnv", @@ -905,9 +915,9 @@ dependencies = [ [[package]] name = 
"logos-derive" -version = "0.14.0" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c2a69b3eb68d5bd595107c9ee58d7e07fe2bb5e360cc85b0f084dedac80de0a" +checksum = "3e5d0c5463c911ef55624739fc353238b4e310f0144be1f875dc42fec6bfd5ec" dependencies = [ "logos-codegen", ] diff --git a/Cargo.toml b/Cargo.toml index e5c6dc7..794dd5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,9 @@ members = [ "crates/lang", "crates/svg-filters", "crates/prowocessing", - "crates/executor-poc", "crates/pawarser", + "crates/executor-poc", + "crates/pawarser", + "crates/json-pawarser", ] resolver = "2" diff --git a/crates/json-pawarser/Cargo.toml b/crates/json-pawarser/Cargo.toml new file mode 100644 index 0000000..eb342e9 --- /dev/null +++ b/crates/json-pawarser/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "json-pawarser" +version = "0.1.0" +edition = "2021" + +[dependencies] +logos = "0.14.2" +enumset = "1.1.3" +rowan = "0.15.15" +pawarser = { path = "../pawarser" } + +[lints] +workspace = true diff --git a/crates/json-pawarser/src/grammar.rs b/crates/json-pawarser/src/grammar.rs new file mode 100644 index 0000000..3ececd8 --- /dev/null +++ b/crates/json-pawarser/src/grammar.rs @@ -0,0 +1,67 @@ +use enumset::{enum_set, EnumSet}; + +use crate::{syntax_error::SyntaxError, syntax_kind::SyntaxKind}; + +use self::object::object; + +type Parser<'src, 'toks> = pawarser::Parser<'src, 'toks, SyntaxKind, SyntaxError>; +type CompletedMarker = pawarser::CompletedMarker; + +const BASIC_VALUE_TOKENS: EnumSet = + enum_set!(SyntaxKind::BOOL | SyntaxKind::NULL | SyntaxKind::NUMBER | SyntaxKind::STRING); + +pub fn value(p: &mut Parser) -> bool { + if BASIC_VALUE_TOKENS.contains(p.current()) { + p.do_bump(); + return true; + } else { + object(p).is_some() + } +} + +mod object { + use crate::{grammar::value, syntax_error::SyntaxError, syntax_kind::SyntaxKind}; + + use super::{CompletedMarker, Parser, BASIC_VALUE_TOKENS}; + + pub(super) fn object(p: &mut 
Parser) -> Option<CompletedMarker> { + let obj_start = p.start("object"); + + if !p.at(SyntaxKind::BRACE_OPEN) { + obj_start.abandon(p); + return None; + } + + todo!() + } + + fn member(p: &mut Parser) -> Option<CompletedMarker> { + let member_start = p.start("member"); + + if p.at(SyntaxKind::BRACE_CLOSE) { + member_start.abandon(p); + return None; + } else if p.at(SyntaxKind::STRING) { + let member_name_start = p.start("member_name"); + p.eat(SyntaxKind::STRING); + member_name_start.complete(p, SyntaxKind::MEMBER_NAME); + } else { + return todo!("handle other tokens"); + } + + if !p.eat(SyntaxKind::COLON) { + todo!("handle wrong tokens") + } + + if value(p) { + Some(member_start.complete(p, SyntaxKind::MEMBER)) + } else { + let e = member_start.error(p, SyntaxError::MemberMissingValue); + Some( + e.precede(p, "member but failed already") + .complete(p, SyntaxKind::MEMBER), + ) + } + } +} +mod array {} diff --git a/crates/json-pawarser/src/lib.rs b/crates/json-pawarser/src/lib.rs new file mode 100644 index 0000000..89160be --- /dev/null +++ b/crates/json-pawarser/src/lib.rs @@ -0,0 +1,3 @@ +mod grammar; +mod syntax_error; +mod syntax_kind; diff --git a/crates/json-pawarser/src/syntax_error.rs b/crates/json-pawarser/src/syntax_error.rs new file mode 100644 index 0000000..6ff9067 --- /dev/null +++ b/crates/json-pawarser/src/syntax_error.rs @@ -0,0 +1,8 @@ +use crate::syntax_kind::SyntaxKind; + +#[derive(Clone)] +pub enum SyntaxError { + DisallowedKeyType(SyntaxKind), + MemberMissingValue, +} +impl pawarser::parser::SyntaxError for SyntaxError {} diff --git a/crates/json-pawarser/src/syntax_kind.rs b/crates/json-pawarser/src/syntax_kind.rs new file mode 100644 index 0000000..9d3dc2c --- /dev/null +++ b/crates/json-pawarser/src/syntax_kind.rs @@ -0,0 +1,103 @@ +use logos::Logos; + +pub fn lex(src: &str) -> Vec<(SyntaxKind, &str)> { + let mut lex = SyntaxKind::lexer(src); + let mut r = Vec::new(); + + while let Some(tok_res) = lex.next() { + r.push((tok_res.unwrap_or(SyntaxKind::LEX_ERR), lex.slice())) 
+ } + + r +} + +#[derive(enumset::EnumSetType, Debug, Logos, PartialEq, Eq, Clone, Copy, Hash)] +#[repr(u16)] +#[enumset(no_super_impls)] +#[allow(non_camel_case_types)] +pub enum SyntaxKind { + // Error SyntaxKinds + LEX_ERR, + PARSE_ERR, + + // Meta SyntaxKinds + TOMBSTONE, + EOF, + + OBJECT, + MEMBER, + MEMBER_NAME, + MEMBER_VALUE, + + ARRAY, + ELEMENT, + + // Tokens + // Regexes adapted from [the logos handbook](https://logos.maciej.codes/examples/json_borrowed.html) + #[token("true")] + #[token("false")] + BOOL, + #[token("{")] + BRACE_OPEN, + #[token("}")] + BRACE_CLOSE, + #[token("[")] + BRACKET_OPEN, + #[token("]")] + BRACKET_CLOSE, + #[token(":")] + COLON, + #[token(",")] + COMMA, + #[token("null")] + NULL, + #[regex(r"-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?")] + NUMBER, + #[regex(r#""([^"\\]|\\["\\bnfrt]|u[a-fA-F0-9]{4})*""#)] + STRING, + + // Whitespace tokens + #[regex("[ \\t\\f]+")] + WHITESPACE, + #[token("\n")] + NEWLINE, +} + +impl pawarser::parser::SyntaxElement for SyntaxKind { + const EOF: Self = Self::EOF; + + const ERROR: Self = Self::PARSE_ERR; +} + +impl From<SyntaxKind> for rowan::SyntaxKind { + fn from(kind: SyntaxKind) -> Self { + Self(kind as u16) + } +} + +#[cfg(test)] +mod tests { + use crate::syntax_kind::{lex, SyntaxKind}; + + #[test] + fn simple_object() { + const TEST_DATA: &str = r#"{"hello_world": "meow", "some_num":7.42}"#; + + assert_eq!( + dbg!(lex(TEST_DATA)), + vec![ + (SyntaxKind::BRACE_OPEN, "{"), + (SyntaxKind::STRING, "\"hello_world\""), + (SyntaxKind::COLON, ":"), + (SyntaxKind::WHITESPACE, " "), + (SyntaxKind::STRING, "\"meow\""), + (SyntaxKind::COMMA, ","), + (SyntaxKind::WHITESPACE, " "), + (SyntaxKind::STRING, "\"some_num\""), + (SyntaxKind::COLON, ":"), + (SyntaxKind::NUMBER, "7.42"), + (SyntaxKind::BRACE_CLOSE, "}") + ] + ); + } +} diff --git a/crates/pawarser/src/lib.rs b/crates/pawarser/src/lib.rs index 92dcd54..26d8679 100644 --- a/crates/pawarser/src/lib.rs +++ b/crates/pawarser/src/lib.rs @@ -1,2 +1,8 @@ 
#![feature(iter_collect_into)] pub mod parser; + +pub use parser::{ + error::SyntaxError, + marker::{CompletedMarker, Marker}, + Parser, SyntaxElement, +}; diff --git a/crates/pawarser/src/parser.rs b/crates/pawarser/src/parser.rs index 0bac65a..09b1bf4 100644 --- a/crates/pawarser/src/parser.rs +++ b/crates/pawarser/src/parser.rs @@ -2,12 +2,13 @@ use std::cell::Cell; use enumset::{EnumSet, EnumSetType}; -use self::{error::SyntaxError, event::Event, input::Input, marker::Marker}; +use self::{event::Event, input::Input, marker::Marker}; +pub use error::SyntaxError; -mod error; +pub mod error; mod event; mod input; -mod marker; +pub mod marker; /// this is used to define some required SyntaxKinds like an EOF token or an error token pub trait SyntaxElement @@ -61,7 +62,7 @@ impl<'src, 'toks, SyntaxKind: SyntaxElement, SyntaxErr: SyntaxError> true } - fn do_bump(&mut self) { + pub fn do_bump(&mut self) { self.push_ev(Event::Eat { count: self.input.preceding_meaningless(self.pos), });