From a3ab844ba7f016e3650e2228a8381aa500ae2add Mon Sep 17 00:00:00 2001 From: Schrottkatze Date: Sun, 13 Oct 2024 15:32:26 +0200 Subject: [PATCH 1/3] pawarser(init): start extracting the parser lib --- Cargo.lock | 9 +++++++++ Cargo.toml | 2 +- crates/pawarser/Cargo.toml | 12 ++++++++++++ crates/pawarser/src/lib.rs | 39 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 crates/pawarser/Cargo.toml create mode 100644 crates/pawarser/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 25bef26..4304154 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1159,6 +1159,15 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pawarser" +version = "0.1.0" +dependencies = [ + "drop_bomb", + "enumset", + "rowan", +] + [[package]] name = "petgraph" version = "0.6.5" diff --git a/Cargo.toml b/Cargo.toml index 6b774ed..e5c6dc7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ members = [ "crates/lang", "crates/svg-filters", "crates/prowocessing", - "crates/executor-poc", + "crates/executor-poc", "crates/pawarser", ] resolver = "2" diff --git a/crates/pawarser/Cargo.toml b/crates/pawarser/Cargo.toml new file mode 100644 index 0000000..787cb2f --- /dev/null +++ b/crates/pawarser/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "pawarser" +version = "0.1.0" +edition = "2021" + +[dependencies] +rowan = "0.15.15" +drop_bomb = "0.1.5" +enumset = "1.1.3" + +[lints] +workspace = true diff --git a/crates/pawarser/src/lib.rs b/crates/pawarser/src/lib.rs new file mode 100644 index 0000000..f1413cb --- /dev/null +++ b/crates/pawarser/src/lib.rs @@ -0,0 +1,39 @@ +#![feature(iter_collect_into)] +pub mod parser { + pub mod input { + use enumset::{EnumSet, EnumSetType}; + + struct Input<'src, 'toks, SyntaxKind: EnumSetType + Into> { + raw: &'toks Vec<(SyntaxKind, &'src str)>, + // enumset of meaningless tokens + semantically_meaningless: EnumSet, + // indices of non-meaningless tokens + meaningful_toks: Vec, + } + + impl<'src, 'toks, SyntaxKind: EnumSetType + Into> + Input<'src, 'toks, SyntaxKind> + { + pub fn new( + raw_toks: &'toks Vec<(SyntaxKind, &'src str)>, + meaningless: Option>, + ) -> Self { + let mut meaningful_toks = Vec::new(); + + if let Some(meaningless) = meaningless { + let meaningful_toks = raw_toks + .iter() + .enumerate() + .filter_map(|(i, tok)| (!meaningless.contains(tok.0)).then_some(i)) + .collect_into(&mut meaningful_toks); + } + + Self { + raw: raw_toks, + semantically_meaningless: meaningless.unwrap_or_default(), + meaningful_toks, + } + } + } + } +} From ec2ff5778b9161fe5805dade2c517177bd8f27ef Mon Sep 17 00:00:00 2001 From: Schrottkatze Date: Sun, 13 Oct 2024 16:44:59 +0200 Subject: [PATCH 2/3] pawarser(setup): basic parser stuff and types around it. also, a builder. --- crates/pawarser/src/lib.rs | 129 ++++++++++++++++++++++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/crates/pawarser/src/lib.rs b/crates/pawarser/src/lib.rs index f1413cb..16955d1 100644 --- a/crates/pawarser/src/lib.rs +++ b/crates/pawarser/src/lib.rs @@ -1,9 +1,15 @@ #![feature(iter_collect_into)] pub mod parser { + use std::cell::Cell; + + use enumset::{EnumSet, EnumSetType}; + + use self::{error::SyntaxError, event::Event, input::Input}; + pub mod input { use enumset::{EnumSet, EnumSetType}; - struct Input<'src, 'toks, SyntaxKind: EnumSetType + Into> { + pub struct Input<'src, 'toks, SyntaxKind: EnumSetType + Into> { raw: &'toks Vec<(SyntaxKind, &'src str)>, // enumset of meaningless tokens semantically_meaningless: EnumSet, @@ -36,4 +42,125 @@ pub mod parser { } } } + mod event { + use enumset::EnumSetType; + + use super::error::SyntaxError; + + pub enum Event, SyntaxErr: SyntaxError> { + Start { + kind: NodeKind, + forward_parent: Option, + }, + Finish, + Eat { + count: usize, + }, + } + + impl, SyntaxErr: SyntaxError> + Event + { + pub fn tombstone() -> Self { + Self::Start { + kind: NodeKind::Tombstone, + forward_parent: None, + } + } + } + + pub enum NodeKind, SyntaxErr: SyntaxError> { + Tombstone, + Syntax(SyntaxKind), + Error(SyntaxErr), + } + + impl, SyntaxErr: SyntaxError> + NodeKind + { + pub fn is_tombstone(&self) -> bool { + matches!(self, Self::Tombstone) + } + pub fn is_syntax(&self) -> bool { + matches!(self, Self::Syntax(_)) + } + pub fn is_error(&self) -> bool { + matches!(self, Self::Error(_)) + } + } + } + mod error { + /// A marker trait... for now! + // TODO: constrain that conversion to `NodeKind::Error` is enforced to be possible + pub trait SyntaxError {} + } + + pub struct Parser< + 'src, + 'toks, + SyntaxKind: EnumSetType + Into, + SyntaxErr: SyntaxError, + > { + input: Input<'src, 'toks, SyntaxKind>, + pos: usize, + events: Vec>, + step_limit: u32, + steps: Cell, + } + + pub struct ParserBuilder< + 'src, + 'toks, + SyntaxKind: EnumSetType + Into, + // SyntaxErr: SyntaxError, + > { + raw_toks: &'toks Vec<(SyntaxKind, &'src str)>, + meaningless_token_kinds: EnumSet, + step_limit: u32, + } + + impl<'src, 'toks, SyntaxKind: EnumSetType + Into> + ParserBuilder<'src, 'toks, SyntaxKind> + { + pub fn new(raw_toks: &'toks Vec<(SyntaxKind, &'src str)>) -> Self { + Self { + raw_toks, + meaningless_token_kinds: EnumSet::new(), + step_limit: 4096, + } + } + + /// Sets the parser step limit. + /// Defaults to 4096 + pub fn step_limit(mut self, new: u32) -> Self { + self.step_limit = new; + self + } + + pub fn add_meaningless(mut self, kind: SyntaxKind) -> Self { + self.meaningless_token_kinds.insert(kind); + self + } + + pub fn add_meaningless_many(mut self, kind: Vec) -> Self { + self.meaningless_token_kinds + .insert_all(kind.into_iter().collect()); + self + } + + pub fn build(self) -> Parser<'src, 'toks, SyntaxKind, SyntaxErr> { + let Self { + raw_toks, + meaningless_token_kinds, + step_limit, + } = self; + Parser { + input: Input::new(raw_toks, Some(meaningless_token_kinds)), + pos: 0, + events: Vec::new(), + step_limit, + steps: Cell::new(0), + } + } + } } From 34ddaacb58d81d235f7f9ed2094ef600cdebc7e3 Mon Sep 17 00:00:00 2001 From: Schrottkatze Date: Sun, 13 Oct 2024 16:47:53 +0200 Subject: [PATCH 3/3] pawarser(chore): split up files --- crates/pawarser/src/lib.rs | 166 +--------------------------- crates/pawarser/src/parser.rs | 78 +++++++++++++ crates/pawarser/src/parser/error.rs | 3 + crates/pawarser/src/parser/event.rs | 45 ++++++++ crates/pawarser/src/parser/input.rs | 34 ++++++ 5 files changed, 161 insertions(+), 165 deletions(-) create mode 100644 crates/pawarser/src/parser.rs create mode 100644 crates/pawarser/src/parser/error.rs create mode 100644 crates/pawarser/src/parser/event.rs create mode 100644 crates/pawarser/src/parser/input.rs diff --git a/crates/pawarser/src/lib.rs b/crates/pawarser/src/lib.rs index 16955d1..92dcd54 100644 --- a/crates/pawarser/src/lib.rs +++ b/crates/pawarser/src/lib.rs @@ -1,166 +1,2 @@ #![feature(iter_collect_into)] -pub mod parser { - use std::cell::Cell; - - use enumset::{EnumSet, EnumSetType}; - - use self::{error::SyntaxError, event::Event, input::Input}; - - pub mod input { - use enumset::{EnumSet, EnumSetType}; - - pub struct Input<'src, 'toks, SyntaxKind: EnumSetType + Into> { - raw: &'toks Vec<(SyntaxKind, &'src str)>, - // enumset of meaningless tokens - semantically_meaningless: EnumSet, - // indices of non-meaningless tokens - meaningful_toks: Vec, - } - - impl<'src, 'toks, SyntaxKind: EnumSetType + Into> - Input<'src, 'toks, SyntaxKind> - { - pub fn new( - raw_toks: &'toks Vec<(SyntaxKind, &'src str)>, - meaningless: Option>, - ) -> Self { - let mut meaningful_toks = Vec::new(); - - if let Some(meaningless) = meaningless { - let meaningful_toks = raw_toks - .iter() - .enumerate() - .filter_map(|(i, tok)| (!meaningless.contains(tok.0)).then_some(i)) - .collect_into(&mut meaningful_toks); - } - - Self { - raw: raw_toks, - semantically_meaningless: meaningless.unwrap_or_default(), - meaningful_toks, - } - } - } - } - mod event { - use enumset::EnumSetType; - - use super::error::SyntaxError; - - pub enum Event, SyntaxErr: SyntaxError> { - Start { - kind: NodeKind, - forward_parent: Option, - }, - Finish, - Eat { - count: usize, - }, - } - - impl, SyntaxErr: SyntaxError> - Event - { - pub fn tombstone() -> Self { - Self::Start { - kind: NodeKind::Tombstone, - forward_parent: None, - } - } - } - - pub enum NodeKind, SyntaxErr: SyntaxError> { - Tombstone, - Syntax(SyntaxKind), - Error(SyntaxErr), - } - - impl, SyntaxErr: SyntaxError> - NodeKind - { - pub fn is_tombstone(&self) -> bool { - matches!(self, Self::Tombstone) - } - pub fn is_syntax(&self) -> bool { - matches!(self, Self::Syntax(_)) - } - pub fn is_error(&self) -> bool { - matches!(self, Self::Error(_)) - } - } - } - mod error { - /// A marker trait... for now! - // TODO: constrain that conversion to `NodeKind::Error` is enforced to be possible - pub trait SyntaxError {} - } - - pub struct Parser< - 'src, - 'toks, - SyntaxKind: EnumSetType + Into, - SyntaxErr: SyntaxError, - > { - input: Input<'src, 'toks, SyntaxKind>, - pos: usize, - events: Vec>, - step_limit: u32, - steps: Cell, - } - - pub struct ParserBuilder< - 'src, - 'toks, - SyntaxKind: EnumSetType + Into, - // SyntaxErr: SyntaxError, - > { - raw_toks: &'toks Vec<(SyntaxKind, &'src str)>, - meaningless_token_kinds: EnumSet, - step_limit: u32, - } - - impl<'src, 'toks, SyntaxKind: EnumSetType + Into> - ParserBuilder<'src, 'toks, SyntaxKind> - { - pub fn new(raw_toks: &'toks Vec<(SyntaxKind, &'src str)>) -> Self { - Self { - raw_toks, - meaningless_token_kinds: EnumSet::new(), - step_limit: 4096, - } - } - - /// Sets the parser step limit. - /// Defaults to 4096 - pub fn step_limit(mut self, new: u32) -> Self { - self.step_limit = new; - self - } - - pub fn add_meaningless(mut self, kind: SyntaxKind) -> Self { - self.meaningless_token_kinds.insert(kind); - self - } - - pub fn add_meaningless_many(mut self, kind: Vec) -> Self { - self.meaningless_token_kinds - .insert_all(kind.into_iter().collect()); - self - } - - pub fn build(self) -> Parser<'src, 'toks, SyntaxKind, SyntaxErr> { - let Self { - raw_toks, - meaningless_token_kinds, - step_limit, - } = self; - Parser { - input: Input::new(raw_toks, Some(meaningless_token_kinds)), - pos: 0, - events: Vec::new(), - step_limit, - steps: Cell::new(0), - } - } - } -} +pub mod parser; diff --git a/crates/pawarser/src/parser.rs b/crates/pawarser/src/parser.rs new file mode 100644 index 0000000..196c776 --- /dev/null +++ b/crates/pawarser/src/parser.rs @@ -0,0 +1,78 @@ +use std::cell::Cell; + +use enumset::{EnumSet, EnumSetType}; + +use self::{error::SyntaxError, event::Event, input::Input}; + +mod error; +mod event; +pub mod input; + +pub struct Parser< + 'src, + 'toks, + SyntaxKind: EnumSetType + Into, + SyntaxErr: SyntaxError, +> { + input: Input<'src, 'toks, SyntaxKind>, + pos: usize, + events: Vec>, + step_limit: u32, + steps: Cell, +} + +pub struct ParserBuilder< + 'src, + 'toks, + SyntaxKind: EnumSetType + Into, + // SyntaxErr: SyntaxError, +> { + raw_toks: &'toks Vec<(SyntaxKind, &'src str)>, + meaningless_token_kinds: EnumSet, + step_limit: u32, +} + +impl<'src, 'toks, SyntaxKind: EnumSetType + Into> + ParserBuilder<'src, 'toks, SyntaxKind> +{ + pub fn new(raw_toks: &'toks Vec<(SyntaxKind, &'src str)>) -> Self { + Self { + raw_toks, + meaningless_token_kinds: EnumSet::new(), + step_limit: 4096, + } + } + + /// Sets the parser step limit. + /// Defaults to 4096 + pub fn step_limit(mut self, new: u32) -> Self { + self.step_limit = new; + self + } + + pub fn add_meaningless(mut self, kind: SyntaxKind) -> Self { + self.meaningless_token_kinds.insert(kind); + self + } + + pub fn add_meaningless_many(mut self, kind: Vec) -> Self { + self.meaningless_token_kinds + .insert_all(kind.into_iter().collect()); + self + } + + pub fn build(self) -> Parser<'src, 'toks, SyntaxKind, SyntaxErr> { + let Self { + raw_toks, + meaningless_token_kinds, + step_limit, + } = self; + Parser { + input: Input::new(raw_toks, Some(meaningless_token_kinds)), + pos: 0, + events: Vec::new(), + step_limit, + steps: Cell::new(0), + } + } +} diff --git a/crates/pawarser/src/parser/error.rs b/crates/pawarser/src/parser/error.rs new file mode 100644 index 0000000..e27c536 --- /dev/null +++ b/crates/pawarser/src/parser/error.rs @@ -0,0 +1,3 @@ +/// A marker trait... for now! +// TODO: constrain that conversion to `NodeKind::Error` is enforced to be possible +pub trait SyntaxError {} diff --git a/crates/pawarser/src/parser/event.rs b/crates/pawarser/src/parser/event.rs new file mode 100644 index 0000000..cb1bed7 --- /dev/null +++ b/crates/pawarser/src/parser/event.rs @@ -0,0 +1,45 @@ +use enumset::EnumSetType; + +use super::error::SyntaxError; + +pub enum Event, SyntaxErr: SyntaxError> { + Start { + kind: NodeKind, + forward_parent: Option, + }, + Finish, + Eat { + count: usize, + }, +} + +impl, SyntaxErr: SyntaxError> + Event +{ + pub fn tombstone() -> Self { + Self::Start { + kind: NodeKind::Tombstone, + forward_parent: None, + } + } +} + +pub enum NodeKind, SyntaxErr: SyntaxError> { + Tombstone, + Syntax(SyntaxKind), + Error(SyntaxErr), +} + +impl, SyntaxErr: SyntaxError> + NodeKind +{ + pub fn is_tombstone(&self) -> bool { + matches!(self, Self::Tombstone) + } + pub fn is_syntax(&self) -> bool { + matches!(self, Self::Syntax(_)) + } + pub fn is_error(&self) -> bool { + matches!(self, Self::Error(_)) + } +} diff --git a/crates/pawarser/src/parser/input.rs b/crates/pawarser/src/parser/input.rs new file mode 100644 index 0000000..b148497 --- /dev/null +++ b/crates/pawarser/src/parser/input.rs @@ -0,0 +1,34 @@ +use enumset::{EnumSet, EnumSetType}; + +pub struct Input<'src, 'toks, SyntaxKind: EnumSetType + Into> { + raw: &'toks Vec<(SyntaxKind, &'src str)>, + // enumset of meaningless tokens + semantically_meaningless: EnumSet, + // indices of non-meaningless tokens + meaningful_toks: Vec, +} + +impl<'src, 'toks, SyntaxKind: EnumSetType + Into> + Input<'src, 'toks, SyntaxKind> +{ + pub fn new( + raw_toks: &'toks Vec<(SyntaxKind, &'src str)>, + meaningless: Option>, + ) -> Self { + let mut meaningful_toks = Vec::new(); + + if let Some(meaningless) = meaningless { + let meaningful_toks = raw_toks + .iter() + .enumerate() + .filter_map(|(i, tok)| (!meaningless.contains(tok.0)).then_some(i)) + .collect_into(&mut meaningful_toks); + } + + Self { + raw: raw_toks, + semantically_meaningless: meaningless.unwrap_or_default(), + meaningful_toks, + } + } +}