From c4041aefb66edad44e91c6620a7c9546b7bf1716 Mon Sep 17 00:00:00 2001 From: PoiScript Date: Fri, 17 May 2019 21:27:01 +0800 Subject: [PATCH] feat(parser): improve list parsing --- src/elements/clock.rs | 39 +++--- src/elements/fn_def.rs | 30 ++--- src/elements/list.rs | 269 +++++++++++++++++++---------------------- src/export/html.rs | 4 +- src/export/mod.rs | 4 +- src/parser.rs | 72 ++++++----- 6 files changed, 208 insertions(+), 210 deletions(-) diff --git a/src/elements/clock.rs b/src/elements/clock.rs index 73a9f10..69a1155 100644 --- a/src/elements/clock.rs +++ b/src/elements/clock.rs @@ -25,20 +25,23 @@ pub enum Clock<'a> { impl<'a> Clock<'a> { pub(crate) fn parse(text: &'a str) -> Option<(Clock<'a>, usize)> { - let (text, off) = memchr(b'\n', text.as_bytes()) + let (text, eol) = memchr(b'\n', text.as_bytes()) .map(|i| (text[..i].trim(), i + 1)) .unwrap_or_else(|| (text.trim(), text.len())); - let tail = memchr(b' ', text.as_bytes()) - .filter(|&i| &text[0..i] == "CLOCK:") - .map(|i| text[i..].trim_start())?; + if !text.starts_with("CLOCK:") { + return None; + } + + let tail = &text["CLOCK:".len()..].trim_start(); if !tail.starts_with('[') { return None; } - let (timestamp, tail) = - Timestamp::parse_inactive(tail).map(|(t, off)| (t, tail[off..].trim_start()))?; + let (timestamp, off) = Timestamp::parse_inactive(tail)?; + + let tail = tail[off..].trim(); match timestamp { Timestamp::InactiveRange { @@ -62,7 +65,7 @@ impl<'a> Clock<'a> { delay, duration, }, - off, + eol, )) } else { None @@ -72,20 +75,14 @@ impl<'a> Clock<'a> { start, repeater, delay, - } => { - if tail.as_bytes().iter().all(u8::is_ascii_whitespace) { - Some(( - Clock::Running { - start, - repeater, - delay, - }, - off, - )) - } else { - None - } - } + } if tail.is_empty() => Some(( + Clock::Running { + start, + repeater, + delay, + }, + eol, + )), _ => None, } } diff --git a/src/elements/fn_def.rs b/src/elements/fn_def.rs index 3c16183..0cfc603 100644 --- a/src/elements/fn_def.rs +++ b/src/elements/fn_def.rs @@ -2,22 +2,24 @@ use memchr::memchr; #[inline] pub fn parse(text: &str) -> Option<(&str, &str, usize)> { - debug_assert!(text.starts_with("[fn:")); + if text.starts_with("[fn:") { + let (label, off) = memchr(b']', text.as_bytes()) + .filter(|&i| { + i != 4 + && text.as_bytes()["[fn:".len()..i] + .iter() + .all(|&c| c.is_ascii_alphanumeric() || c == b'-' || c == b'_') + }) + .map(|i| (&text["[fn:".len()..i], i + 1))?; - let (label, off) = memchr(b']', text.as_bytes()) - .filter(|&i| { - i != 4 - && text.as_bytes()["[fn:".len()..i] - .iter() - .all(|&c| c.is_ascii_alphanumeric() || c == b'-' || c == b'_') - }) - .map(|i| (&text["[fn:".len()..i], i + 1))?; + let (content, off) = memchr(b'\n', text.as_bytes()) + .map(|i| (&text[off..i], i)) + .unwrap_or_else(|| (&text[off..], text.len())); - let (content, off) = memchr(b'\n', text.as_bytes()) - .map(|i| (&text[off..i], i)) - .unwrap_or_else(|| (&text[off..], text.len())); - - Some((label, content, off)) + Some((label, content, off)) + } else { + None + } } #[cfg(test)] diff --git a/src/elements/list.rs b/src/elements/list.rs index 2de5a0c..10676d2 100644 --- a/src/elements/list.rs +++ b/src/elements/list.rs @@ -1,16 +1,63 @@ use memchr::memchr_iter; +use std::iter::once; +// (indentation, ordered, limit, end) #[inline] -pub fn is_item(text: &str) -> Option<(bool, &str)> { - if text.is_empty() { - return None; +pub fn parse(text: &str) -> Option<(usize, bool, usize, usize)> { + let (indent, tail) = text + .find(|c| c != ' ') + .map(|off| (off, &text[off..])) + .unwrap_or((0, text)); + + let ordered = is_item(tail)?; + let bytes = text.as_bytes(); + let mut lines = memchr_iter(b'\n', bytes) + .map(|i| i + 1) + .chain(once(text.len())); + let mut pos = lines.next()?; + + while let Some(i) = lines.next() { + let line = &text[pos..i]; + return if let Some(line_indent) = line.find(|c: char| !c.is_whitespace()) { + // this line is no empty + if line_indent < indent + || (line_indent == indent && is_item(&line[line_indent..]).is_none()) + { + Some((indent, ordered, pos, pos)) + } else { + pos = i; + continue; + } + } else if let Some(next_i) = lines.next() { + // this line is empty + let line = &text[i..next_i]; + if let Some(line_indent) = line.find(|c: char| !c.is_whitespace()) { + if line_indent < indent + || (line_indent == indent && is_item(&line[line_indent..]).is_none()) + { + Some((indent, ordered, pos, pos)) + } else { + pos = next_i; + continue; + } + } else { + Some((indent, ordered, pos, next_i)) + } + } else { + Some((indent, ordered, pos, i)) + }; } + Some((indent, ordered, pos, pos)) +} + +#[inline] +pub fn is_item(text: &str) -> Option { let bytes = text.as_bytes(); - match bytes[0] { + match bytes.get(0)? { b'*' | b'-' | b'+' => { if text.len() > 1 && (bytes[1] == b' ' || bytes[1] == b'\n') { - Some((false, &text[0..2])) + Some(false) } else { None } @@ -21,10 +68,10 @@ pub fn is_item(text: &str) -> Option<(bool, &str)> { .position(|&c| !c.is_ascii_digit()) .unwrap_or_else(|| text.len() - 1); if (bytes[i] == b'.' || bytes[i] == b')') - && i + 1 < text.len() + && text.len() > i + 1 && (bytes[i + 1] == b' ' || bytes[i + 1] == b'\n') { - Some((true, &text[0..i + 2])) + Some(true) } else { None } @@ -33,141 +80,79 @@ pub fn is_item(text: &str) -> Option<(bool, &str)> { } } -// check if list item ends at this line -#[inline] -fn is_item_ends(line: &str, ident: usize) -> Option<&str> { - debug_assert!(!line.is_empty()); - - let line_ident = line - .as_bytes() - .iter() - .position(|&c| c != b' ' && c != b'\t') - .unwrap_or(0); - - debug_assert!(line_ident >= ident, "{} >= {}", line_ident, ident); - - if line_ident == ident { - is_item(&line[ident..]).map(|(_, bullet)| bullet) - } else { - None - } +#[test] +fn test_is_item() { + assert_eq!(is_item("+ item"), Some(false)); + assert_eq!(is_item("- item"), Some(false)); + assert_eq!(is_item("10. item"), Some(true)); + assert_eq!(is_item("10) item"), Some(true)); + assert_eq!(is_item("1. item"), Some(true)); + assert_eq!(is_item("1) item"), Some(true)); + assert_eq!(is_item("10. "), Some(true)); + assert_eq!(is_item("10.\n"), Some(true)); + assert_eq!(is_item("10."), None); + assert_eq!(is_item("+"), None); + assert_eq!(is_item("-item"), None); + assert_eq!(is_item("+item"), None); } -// return (limit, end, next item bullet) -#[inline] -pub fn parse(text: &str, ident: usize) -> (usize, usize, Option<&str>) { - let bytes = text.as_bytes(); - let mut lines = memchr_iter(b'\n', bytes); - let mut pos = if let Some(i) = lines.next() { - i + 1 - } else { - return (text.len(), text.len(), None); - }; - - while let Some(i) = lines.next() { - return if bytes[pos..i].iter().all(u8::is_ascii_whitespace) { - if let Some(nexti) = lines.next() { - if bytes[i + 1..nexti].iter().all(u8::is_ascii_whitespace) { - // two consecutive empty lines - (pos - 1, nexti + 1, None) - } else if let Some(next) = is_item_ends(&text[i + 1..nexti], ident) { - (pos - 1, i + 1, Some(next)) - } else { - pos = nexti + 1; - continue; - } - } else if bytes[i + 1..].iter().all(u8::is_ascii_whitespace) { - // two consecutive empty lines - (pos - 1, text.len(), None) - } else if let Some(next) = is_item_ends(&text[i + 1..], ident) { - (pos - 1, i + 1, Some(next)) - } else { - (text.len(), text.len(), None) - } - } else if let Some(next) = is_item_ends(&text[pos..i], ident) { - (pos - 1, pos, Some(next)) - } else { - pos = i + 1; - continue; - }; - } - - if bytes[pos..].iter().all(u8::is_ascii_whitespace) { - (pos - 1, text.len(), None) - } else if let Some(next) = is_item_ends(&text[pos..], ident) { - (pos - 1, pos, Some(next)) - } else { - (text.len(), text.len(), None) - } -} - -#[cfg(test)] -mod tests { - #[test] - fn is_item() { - use super::is_item; - - assert_eq!(is_item("+ item"), Some((false, "+ "))); - assert_eq!(is_item("- item"), Some((false, "- "))); - assert_eq!(is_item("10. item"), Some((true, "10. "))); - assert_eq!(is_item("10) item"), Some((true, "10) "))); - assert_eq!(is_item("1. item"), Some((true, "1. "))); - assert_eq!(is_item("1) item"), Some((true, "1) "))); - assert_eq!(is_item("10. "), Some((true, "10. "))); - assert_eq!(is_item("10.\n"), Some((true, "10.\n"))); - assert_eq!(is_item("10."), None); - assert_eq!(is_item("+"), None); - assert_eq!(is_item("-item"), None); - assert_eq!(is_item("+item"), None); - } - - #[test] - fn parse() { - use super::parse; - - assert_eq!( - parse("item1\n+ item2", 0), - ("item1".len(), "item1\n".len(), Some("+ ")) - ); - assert_eq!( - parse("item1\n \n* item2", 0), - ("item1".len(), "item1\n \n".len(), Some("* ")) - ); - assert_eq!( - parse("item1\n \n \n* item2", 0), - ("item1".len(), "item1\n \n \n".len(), None) - ); - assert_eq!( - parse("item1\n \n ", 0), - ("item1".len(), "item1\n \n ".len(), None) - ); - assert_eq!( - parse("item1\n + item2\n ", 0), - ( - "item1\n + item2".len(), - "item1\n + item2\n ".len(), - None - ) - ); - assert_eq!( - parse("item1\n \n + item2\n \n+ item 3", 0), - ( - "item1\n \n + item2".len(), - "item1\n \n + item2\n \n".len(), - Some("+ ") - ) - ); - assert_eq!( - parse("item1\n \n + item2", 2), - ("item1".len(), "item1\n \n".len(), Some("+ ")) - ); - assert_eq!( - parse("1\n\n - 2\n\n - 3\n\n+ 4", 0), - ( - "1\n\n - 2\n\n - 3".len(), - "1\n\n - 2\n\n - 3\n\n".len(), - Some("+ ") - ) - ); - } +#[test] +fn test_parse() { + assert_eq!( + parse("+ item1\n+ item2"), + Some((0, false, "+ item1\n+ item2".len(), "+ item1\n+ item2".len())) + ); + assert_eq!( + parse("* item1\n \n* item2"), + Some(( + 0, + false, + "* item1\n \n* item2".len(), + "* item1\n \n* item2".len() + )) + ); + assert_eq!( + parse("* item1\n \n \n* item2"), + Some((0, false, "* item1\n".len(), "* item1\n \n \n".len())) + ); + assert_eq!( + parse("* item1\n \n "), + Some((0, false, "+ item1\n".len(), "* item1\n \n ".len())) + ); + assert_eq!( + parse("+ item1\n + item2\n "), + Some(( + 0, + false, + "+ item1\n + item2\n".len(), + "+ item1\n + item2\n ".len() + )) + ); + assert_eq!( + parse("+ item1\n \n + item2\n \n+ item 3"), + Some(( + 0, + false, + "+ item1\n \n + item2\n \n+ item 3".len(), + "+ item1\n \n + item2\n \n+ item 3".len() + )) + ); + assert_eq!( + parse(" + item1\n \n + item2"), + Some(( + 2, + false, + " + item1\n \n + item2".len(), + " + item1\n \n + item2".len() + )) + ); + assert_eq!( + parse("+ 1\n\n - 2\n\n - 3\n\n+ 4"), + Some(( + 0, + false, + "+ 1\n\n - 2\n\n - 3\n\n+ 4".len(), + "+ 1\n\n - 2\n\n - 3\n\n+ 4".len() + )) + ); } diff --git a/src/export/html.rs b/src/export/html.rs index 4743eae..b23e79f 100644 --- a/src/export/html.rs +++ b/src/export/html.rs @@ -106,14 +106,14 @@ pub trait HtmlHandler> { fn dyn_block_end(&mut self, w: &mut W) -> Result<(), E> { Ok(()) } - fn list_beg(&mut self, w: &mut W, ordered: bool) -> Result<(), E> { + fn list_beg(&mut self, w: &mut W, _indent: usize, ordered: bool) -> Result<(), E> { if ordered { Ok(write!(w, "
    ")?) } else { Ok(write!(w, "
      ")?) } } - fn list_end(&mut self, w: &mut W, ordered: bool) -> Result<(), E> { + fn list_end(&mut self, w: &mut W, _indent: usize, ordered: bool) -> Result<(), E> { if ordered { Ok(write!(w, "
")?) } else { diff --git a/src/export/mod.rs b/src/export/mod.rs index 44266bb..d747c6e 100644 --- a/src/export/mod.rs +++ b/src/export/mod.rs @@ -25,8 +25,8 @@ macro_rules! handle_event { VerseBlock { cont, args } => $handler.verse_block($writer, cont, args)?, DynBlockBeg { name, args } => $handler.dyn_block_beg($writer, name, args)?, DynBlockEnd => $handler.dyn_block_end($writer)?, - ListBeg { ordered } => $handler.list_beg($writer, ordered)?, - ListEnd { ordered } => $handler.list_end($writer, ordered)?, + ListBeg { indent, ordered } => $handler.list_beg($writer, indent, ordered)?, + ListEnd { indent, ordered } => $handler.list_end($writer, indent, ordered)?, ListItemBeg { bullet } => $handler.list_beg_item($writer, bullet)?, ListItemEnd => $handler.list_end_item($writer)?, Call { value } => $handler.call($writer, value)?, diff --git a/src/parser.rs b/src/parser.rs index ae6516a..dd5ccf6 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -70,9 +70,11 @@ pub enum Event<'a> { }, ListBeg { + indent: usize, ordered: bool, }, ListEnd { + indent: usize, ordered: bool, }, ListItemBeg { @@ -138,7 +140,6 @@ pub enum Event<'a> { pub struct Parser<'a> { text: &'a str, stack: Vec<(Container, usize, usize)>, - next_item: Vec>, off: usize, ele_buf: Option<(Event<'a>, usize, usize, usize)>, obj_buf: Option<(Event<'a>, usize, usize, usize)>, @@ -151,7 +152,6 @@ impl<'a> Parser<'a> { Parser { text, stack: Vec::new(), - next_item: Vec::new(), off: 0, ele_buf: None, obj_buf: None, @@ -164,7 +164,6 @@ impl<'a> Parser<'a> { Parser { text, stack: Vec::new(), - next_item: Vec::new(), off: 0, ele_buf: None, obj_buf: None, @@ -191,7 +190,6 @@ impl<'a> Parser<'a> { pub fn set_text(&mut self, text: &'a str) { self.off = 0; self.stack.clear(); - self.next_item.clear(); self.ele_buf = None; self.obj_buf = None; self.text = text; @@ -208,7 +206,7 @@ impl<'a> Parser<'a> { Container::DynBlock => Event::DynBlockEnd, Container::Headline(_) => Event::HeadlineEnd, Container::Italic => Event::ItalicEnd, - Container::List(_, ordered) => Event::ListEnd { ordered }, + Container::List(indent, ordered) => Event::ListEnd { indent, ordered }, Container::ListItem => Event::ListItemEnd, Container::Paragraph => Event::ParagraphEnd, Container::QteBlock => Event::QteBlockEnd, @@ -300,8 +298,8 @@ impl<'a> Parser<'a> { Event::CtrBlockBeg => self.push_stack(Container::CtrBlock, limit, end), Event::SplBlockBeg { .. } => self.push_stack(Container::SplBlock, limit, end), Event::DynBlockBeg { .. } => self.push_stack(Container::DynBlock, limit, end), - Event::ListBeg { ordered, .. } => { - self.push_stack(Container::List(limit, ordered), end, end) + Event::ListBeg { ordered, indent } => { + self.push_stack(Container::List(indent, ordered), limit, end) } _ => (), } @@ -315,10 +313,10 @@ impl<'a> Parser<'a> { fn real_next_ele(&mut self, text: &'a str) -> Option<(Event<'a>, usize, usize, usize)> { debug_assert!(!text.starts_with('\n')); - if text.starts_with("[fn:") { - if let Some((label, cont, off)) = fn_def::parse(text) { - return Some((Event::FnDef { label, cont }, off + 1, 0, 0)); - } + if let Some((label, cont, off)) = fn_def::parse(text) { + return Some((Event::FnDef { label, cont }, off + 1, 0, 0)); + } else if let Some((indent, ordered, limit, end)) = list::parse(text) { + return Some((Event::ListBeg { indent, ordered }, 0, limit, end)); } let (tail, line_begin) = text @@ -326,15 +324,8 @@ impl<'a> Parser<'a> { .map(|off| (&text[off..], off)) .unwrap_or((text, 0)); - if let Some((ordered, bullet)) = list::is_item(tail) { - self.next_item.push(Some(bullet)); - return Some((Event::ListBeg { ordered }, 0, line_begin, text.len())); - } - - if tail.starts_with("CLOCK:") { - if let Some((clock, off)) = Clock::parse(tail) { - return Some((Event::Clock(clock), off + line_begin, 0, 0)); - } + if let Some((clock, off)) = Clock::parse(tail) { + return Some((Event::Clock(clock), off + line_begin, 0, 0)); } // TODO: LaTeX environment @@ -556,6 +547,31 @@ impl<'a> Parser<'a> { } } + fn next_list_item(&self, text: &'a str, indent: usize) -> (&'a str, usize, usize, usize) { + use std::iter::once; + + debug_assert!(&text[0..indent].trim().is_empty()); + let off = &text[indent..].find(' ').unwrap() + 1 + indent; + + let bytes = text.as_bytes(); + let mut lines = memchr_iter(b'\n', bytes) + .map(|i| i + 1) + .chain(once(text.len())); + let mut pos = lines.next().unwrap(); + + while let Some(i) = lines.next() { + let line = &text[pos..i]; + if let Some(line_indent) = line.find(|c: char| !c.is_whitespace()) { + if line_indent == indent { + return (&text[indent..off], off, pos, pos); + } + } + pos = i; + } + + (&text[indent..off], off, text.len(), text.len()) + } + #[inline] fn push_stack(&mut self, container: Container, limit: usize, end: usize) { self.stack @@ -572,7 +588,7 @@ impl<'a> Parser<'a> { Container::DynBlock => Event::DynBlockEnd, Container::Headline(_) => Event::HeadlineEnd, Container::Italic => Event::ItalicEnd, - Container::List(_, ordered) => Event::ListEnd { ordered }, + Container::List(indent, ordered) => Event::ListEnd { indent, ordered }, Container::ListItem => Event::ListItemEnd, Container::Paragraph => Event::ParagraphEnd, Container::QteBlock => Event::QteBlockEnd, @@ -602,7 +618,7 @@ impl<'a> Iterator for Parser<'a> { let tail = &self.text[self.off..limit]; - // eprintln!("{:?} {:?} {:?}", container, tail, self.next_item); + // eprintln!("{:?} {:?}", container, tail); Some(match container { Container::Headline(beg) => { @@ -646,18 +662,16 @@ impl<'a> Iterator for Parser<'a> { self.next_ele(tail) } } - Container::List(ident, ordered) => { - if let Some(bullet) = self.next_item.pop().unwrap() { - let off = bullet.len() + ident; - self.off += off; - let (limit, end, next) = list::parse(&tail[off..], ident); + Container::List(indent, ordered) => { + if self.off < limit { + let (bullet, off, limit, end) = self.next_list_item(tail, indent); self.push_stack(Container::ListItem, limit, end); - self.next_item.push(next); + self.off += off; Event::ListItemBeg { bullet } } else { self.off = end; self.stack.pop(); - Event::ListEnd { ordered } + Event::ListEnd { indent, ordered } } } Container::Paragraph