Fix headline parsing leading to dropped lines.
Currently, headline parsing breaks the file into lines before parsing headlines, stripping terminal \n or \r\n. This prevents parse_headline_level from differentiating between end of line and end of file. This can lead to an edge case where a line is considered a headline for the purposes of stopping parsing the body of the previous, yet not a headline itself. This leads to parsing stopping there. If the file is immediately written, this results in truncating it. One example of this is `"* \n*\r\n* \n"`, which will be parsed identically to `"* \n"`.
This commit is contained in:
parent
5e4457e26e
commit
4fe0958843
132
src/parsers.rs
132
src/parsers.rs
|
@ -4,7 +4,12 @@ use std::marker::PhantomData;
|
||||||
use indextree::{Arena, NodeId};
|
use indextree::{Arena, NodeId};
|
||||||
use jetscii::{bytes, BytesConst};
|
use jetscii::{bytes, BytesConst};
|
||||||
use memchr::{memchr, memchr_iter};
|
use memchr::{memchr, memchr_iter};
|
||||||
use nom::bytes::complete::take_while1;
|
use nom::{
|
||||||
|
bytes::complete::is_a,
|
||||||
|
character::complete::one_of,
|
||||||
|
combinator::{map, verify},
|
||||||
|
IResult,
|
||||||
|
};
|
||||||
|
|
||||||
use crate::config::ParseConfig;
|
use crate::config::ParseConfig;
|
||||||
use crate::elements::{
|
use crate::elements::{
|
||||||
|
@ -635,23 +640,114 @@ pub fn blank_lines_count(input: &str) -> (&str, usize) {
|
||||||
crate::parse::combinators::blank_lines_count(input).unwrap_or((input, 0))
|
crate::parse::combinators::blank_lines_count(input).unwrap_or((input, 0))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_headline(input: &str) -> Option<(&str, (&str, usize))> {
|
// Matches a headline of level <= max_level. This will always be exactly one
|
||||||
let (input_, level) = parse_headline_level(input)?;
|
// line, including the terminal \n if one is present. Unlike org-mode (but like
|
||||||
let (input_, content) = lines_while(move |line| {
|
// org-element), we accept '\n' and EOF to terminate the stars. Returns the
|
||||||
parse_headline_level(line)
|
// number of stars. Must only be called at the start of a line.
|
||||||
.map(|(_, l)| l > level)
|
fn parse_headline_level_le(input: &str, max_level: usize) -> IResult<&str, usize, ()> {
|
||||||
.unwrap_or(true)
|
let (input, level) = verify(
|
||||||
})(input_)
|
map(is_a("*"), |s: &str| s.chars().count()),
|
||||||
.unwrap_or((input_, ""));
|
|level: &usize| *level <= max_level,
|
||||||
Some((input_, (&input[0..level + content.len()], level)))
|
)(input)?;
|
||||||
}
|
if !input.is_empty() {
|
||||||
|
one_of("\n ")(input)?;
|
||||||
pub fn parse_headline_level(input: &str) -> Option<(&str, usize)> {
|
let (input, _) = line_length(input)?;
|
||||||
let (input, stars) = take_while1::<_, _, ()>(|c: char| c == '*')(input).ok()?;
|
Ok((input, level))
|
||||||
|
|
||||||
if input.starts_with(' ') || input.starts_with('\n') || input.is_empty() {
|
|
||||||
Some((input, stars.len()))
|
|
||||||
} else {
|
} else {
|
||||||
None
|
Ok((input, level))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recognizes until end-of-line or end-of-input and returns the length of the
|
||||||
|
// line, including the terminal \n (or \r\n) if present.
|
||||||
|
fn line_length(input: &str) -> IResult<&str, usize, ()> {
|
||||||
|
match memchr(b'\n', input.as_bytes()) {
|
||||||
|
Some(index) => Ok((&input[index + 1..], index + 1)),
|
||||||
|
None => Ok(("", input.len())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_headline(input: &str) -> Option<(&str, (&str, usize))> {
|
||||||
|
// Consume the headline.
|
||||||
|
let (text, level) = parse_headline_level_le(input, std::usize::MAX).ok()?;
|
||||||
|
|
||||||
|
// Collect lines until EOF or a headline.
|
||||||
|
let mut last = 0;
|
||||||
|
for i in memchr_iter(b'\n', text.as_bytes()) {
|
||||||
|
if parse_headline_level_le(&text[last..], level).is_ok() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
last = i + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if last < text.len() && parse_headline_level_le(&text[last..], level).is_err() {
|
||||||
|
Some(("", (input, level)))
|
||||||
|
} else {
|
||||||
|
Some((
|
||||||
|
&text[last..],
|
||||||
|
(&input[..(input.len() - text.len()) + last], level),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_headline() {
|
||||||
|
assert_eq!(parse_headline("*"), Some(("", ("*", 1))));
|
||||||
|
assert_eq!(parse_headline("* "), Some(("", ("* ", 1))));
|
||||||
|
assert_eq!(parse_headline("* \r"), Some(("", ("* \r", 1))));
|
||||||
|
assert_eq!(parse_headline("*\t"), None);
|
||||||
|
assert_eq!(parse_headline("*\t\n"), None);
|
||||||
|
assert_eq!(parse_headline("* \n"), Some(("", ("* \n", 1))));
|
||||||
|
assert_eq!(parse_headline("* \n\r*"), Some(("", ("* \n\r*", 1))));
|
||||||
|
assert_eq!(parse_headline("* \n\r**"), Some(("", ("* \n\r**", 1))));
|
||||||
|
assert_eq!(parse_headline("*\n*"), Some(("*", ("*\n", 1))));
|
||||||
|
assert_eq!(parse_headline("*\n\n*"), Some(("*", ("*\n\n", 1))));
|
||||||
|
assert_eq!(parse_headline("*\r"), None);
|
||||||
|
assert_eq!(parse_headline("* *"), Some(("", ("* *", 1))));
|
||||||
|
assert_eq!(parse_headline("***\r** Hello\n"), None);
|
||||||
|
assert_eq!(
|
||||||
|
parse_headline("*** ** Hello\n"),
|
||||||
|
Some(("", ("*** ** Hello\n", 3)))
|
||||||
|
);
|
||||||
|
assert_eq!(parse_headline("* Hello"), Some(("", ("* Hello", 1))));
|
||||||
|
assert_eq!(
|
||||||
|
parse_headline("*** Hi\nWorld"),
|
||||||
|
Some(("", ("*** Hi\nWorld", 3)))
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
parse_headline("* Hello\nText\n** Test\n ** More text\n* World\n"),
|
||||||
|
Some(("* World\n", ("* Hello\nText\n** Test\n ** More text\n", 1)))
|
||||||
|
);
|
||||||
|
|
||||||
|
// We can parse a headline that contains the *\r\n. It is treated as
|
||||||
|
// text in the section.
|
||||||
|
assert_eq!(
|
||||||
|
parse_headline("* \n*\r\n* \n"),
|
||||||
|
Some(("* \n", ("* \n*\r\n", 1)))
|
||||||
|
);
|
||||||
|
|
||||||
|
// We can't parse a headline starting at *\r\n, thus ensuring that each
|
||||||
|
// line either is or is not a headline.
|
||||||
|
assert_eq!(parse_headline("*\r\n* \n"), None);
|
||||||
|
|
||||||
|
assert_eq!(parse_headline("* \n"), Some(("", ("* \n", 1))));
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
parse_headline("* \n**\r\n* \n"),
|
||||||
|
Some(("* \n", ("* \n**\r\n", 1)))
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(parse_headline("* a\n*"), Some(("*", ("* a\n", 1))));
|
||||||
|
assert_eq!(parse_headline("* a\r\n*"), Some(("*", ("* a\r\n", 1))));
|
||||||
|
assert_eq!(parse_headline("* a\r\n* b"), Some(("* b", ("* a\r\n", 1))));
|
||||||
|
assert_eq!(parse_headline("* a\n* "), Some(("* ", ("* a\n", 1))));
|
||||||
|
assert_eq!(parse_headline("* a\n* \n"), Some(("* \n", ("* a\n", 1))));
|
||||||
|
assert_eq!(parse_headline("* a\n* \n"), Some(("* \n", ("* a\n", 1))));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue