Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,13 @@ public class MarkdownParser extends AbstractEncodingDetectorParser {
//commonmark extensions handed to the shared parser's builder: tables and strikethrough
private static final List<Extension> EXTENSIONS =
List.of(TablesExtension.create(), StrikethroughExtension.create());

//immutable and thread-safe
private static final Parser COMMONMARK = Parser.builder().extensions(EXTENSIONS).build();
//Cap on block nesting passed to maxOpenBlockParsers: deeper blocks are parsed as flat
//paragraph text rather than nested structure, so a pathologically deep block document
//still extracts. Kept below SecureContentHandler's 100-level element-nesting cap so the
//flattened output stays under that limit and is emitted rather than rejected as a
//suspected zip bomb.
private static final int MAX_OPEN_BLOCK_PARSERS = 64;

//immutable and thread-safe.
private static final Parser COMMONMARK = Parser.builder()
    .extensions(EXTENSIONS)
    .maxOpenBlockParsers(MAX_OPEN_BLOCK_PARSERS)
    .build();

public MarkdownParser() {
super();
Expand All @@ -123,6 +128,9 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata
metadata.set(Metadata.CONTENT_TYPE, new MediaType(MARKDOWN, charset).toString());
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
document = COMMONMARK.parseReader(reader);
} catch (StackOverflowError e) {
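//maxOpenBlockParsers only caps block nesting; if parsing still exhausts the stack,
//report it as a TikaException instead of letting the Error propagate to the caller.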
throw new TikaException("Markdown is too deeply nested to parse", e);
}

XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,14 @@ public void testDataURIsBecomeEmbeddedDocuments() throws Exception {
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
}

@Test
public void testDeeplyNestedBlocksAreFlattenedNotFailed() throws Exception {
    //5000 nested block-quote markers followed by one word of text. Because
    //maxOpenBlockParsers caps block nesting below SecureContentHandler's 100-level limit,
    //the excess nesting is flattened to text and the document still extracts, instead of
    //being rejected as a suspected zip bomb or overflowing the stack.
    String deeplyQuoted = "> ".repeat(5000) + "deep\n";
    String extractedXml = parseString(deeplyQuoted).xml;
    assertContains("deep", extractedXml);
}

@Test
public void testRoundTripsBackToMarkdown() throws Exception {
String markdown = "# Title\n\nSome **bold** and *italic* and ~~struck~~ `inline`.\n";
Expand Down
Loading