Mathjax tags in html_backend.py
Question
I'm using html_content with MathJax to display mathematical formulas. However, when converting it to Markdown using the HTML_backend, the MathJax expressions aren't rendered correctly—some formulas written in LaTeX within MathJax tags disappear during the conversion.
<div>
<div>
<div>
<mjx-container><mjx-assistive-mml><math><mstyle><mrow><mo>[</mo><mtable><mtr><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><m
td><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt
><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd></mtr></mtable><mo>]</mo></mrow></mstyle></math></mjx-assistive-mml></mjx-container>
</div>
</div>
</div>
Output of the Markdown conversion using the HTML_backend:
Here is the conversion result I need:
Could you please update the mathjax_tags in the HTML_backend to ensure that LaTeX formulas within MathJax tags are preserved during Markdown conversion?
Thank you
Let me look at this!
Let me look at this!
Here is my custom handler for `mjx-container` tags (`handle_mathml_formula`), which you can use as a reference. I can't open a merge request against your code myself, so I hope this helps speed up support for extracting content from MathJax tags:
from pylatexenc.latex2text import LatexNodes2Text
TAGS_FOR_NODE_ITEMS: Final = [
...,
"mjx-container"
]
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc)
...
elif tag.name in ["mjx-container"]:
self.handle_mathml_formula(tag, doc)
....
def handle_mathml_formula(self, element: Tag, doc: DoclingDocument) -> None:
    """Convert the MathML inside an ``mjx-container`` and add it to ``doc`` as text.

    The ``<math>`` element found inside ``element`` is translated to LaTeX.
    The following siblings of ``element`` are then scanned: non-empty text
    nodes are appended verbatim, and further ``mjx-container`` / ``math``
    siblings are translated and appended as well. The combined LaTeX is
    passed through ``LatexNodes2Text`` and stored as one ``TEXT`` item under
    ``self.parents[self.level]``.

    Supported MathML elements are ``msqrt``, ``mfrac``, ``mrow``, ``mtable``
    / ``mtr`` / ``mtd``, ``msub`` and ``msup``; any other element contributes
    the concatenated conversion of its children.

    Args:
        element: The ``mjx-container`` tag to process.
        doc: The document that receives the resulting text item.

    Returns:
        None. Nothing is added when no content could be extracted. Failures
        are logged through ``_log`` instead of being raised.
    """

    def local_name(tag: Tag) -> str:
        # Drop a "{namespace}" or "prefix:" qualifier so that e.g.
        # "mml:mfrac" and "mfrac" are dispatched identically.
        return tag.name.split('}')[-1].split(':')[-1]

    def child_tags(elem: Tag) -> list:
        # ``.children`` also yields the whitespace text nodes found between
        # tags in pretty-printed markup; only element children count as
        # MathML arguments.
        return [child for child in elem.children if isinstance(child, Tag)]

    def convert_children(elem: Tag) -> str:
        return ''.join(convert_element(child) for child in elem.children)

    def convert_pair(elem: Tag, tag: str) -> tuple:
        # Two-argument constructs (fraction, subscript, superscript).
        # Raising, rather than returning an "Error..." string, keeps the
        # message out of the generated LaTeX when the problem is nested.
        args = child_tags(elem)
        if len(args) != 2:
            raise ValueError(f"Error: <{tag}> must have exactly two children")
        return convert_element(args[0]), convert_element(args[1])

    def convert_row(row: Tag) -> list:
        # Cells are kept as a list instead of being joined on a separator
        # and split again, so a cell containing "|" stays a single cell.
        return [
            convert_element(cell)
            for cell in child_tags(row)
            if local_name(cell) == 'mtd'
        ]

    def convert_element(elem: PageElement) -> str:
        if isinstance(elem, NavigableString):
            return elem.strip()
        if not isinstance(elem, Tag):
            return ''
        tag = local_name(elem)
        if tag == 'msqrt':
            return f'\\sqrt{{{convert_children(elem)}}}'
        if tag == 'mfrac':
            numerator, denominator = convert_pair(elem, tag)
            return f'\\frac{{{numerator}}}{{{denominator}}}'
        if tag == 'msub':
            base, subscript = convert_pair(elem, tag)
            return f'{{{base}}}_{{{subscript}}}'
        if tag == 'msup':
            base, superscript = convert_pair(elem, tag)
            return f'{{{base}}}^{{{superscript}}}'
        if tag == 'mtable':
            rows = [
                convert_row(row)
                for row in child_tags(elem)
                if local_name(row) == 'mtr'
            ]
            latex_rows = [' & '.join(cells) for cells in rows if cells]
            return (
                '\\begin{pmatrix} '
                + ' \\\\ '.join(latex_rows)
                + ' \\end{pmatrix}'
            )
        if tag == 'mtr':
            # Only reached for a row that is not a direct child of <mtable>.
            return ' & '.join(convert_row(elem))
        # mrow, mtd, mn, mi, mo, math, mstyle, ...: plain concatenation.
        return convert_children(elem)

    def math_to_latex(math_tag: Tag, where: str) -> str:
        # Convert one <math> subtree; on failure log and contribute nothing
        # so the remaining formulas are still processed.
        try:
            converted = convert_element(math_tag)
        except ValueError as exc:
            _log.error("Failed to convert MathML to LaTeX in %s: %s", where, exc)
            return ''
        if not converted:
            _log.error(
                "Failed to convert MathML to LaTeX in %s: %s", where, converted
            )
        return converted

    try:
        parts = []

        math_tag = element.find("math")
        if math_tag is not None:
            parts.append(math_to_latex(math_tag, "main mjx-container"))

        # NOTE(review): this consumes every following sibling of the
        # container. If the caller's tree walk visits those siblings too,
        # their content would be emitted twice - confirm against the caller.
        for sibling in element.next_siblings:
            if isinstance(sibling, NavigableString):
                # Text between formulas (e.g. operators or connectors).
                text = sibling.strip()
                if text:
                    parts.append(text)
            elif isinstance(sibling, Tag):
                if sibling.name == "mjx-container":
                    nested_math = sibling.find("math")
                    if nested_math is not None:
                        parts.append(
                            math_to_latex(nested_math, "sibling mjx-container")
                        )
                elif sibling.name == "math":
                    parts.append(math_to_latex(sibling, "sibling math tag"))

        latex_content = ''.join(parts)
        if not latex_content:
            _log.error("Error: Empty MathML content after processing")
            return

        doc.add_text(
            parent=self.parents[self.level],
            label=DocItemLabel.TEXT,
            text=LatexNodes2Text().latex_to_text(latex_content),
            content_layer=self.content_layer,
        )
    except Exception as exc:
        # Best-effort boundary: a formula that cannot be processed must not
        # abort the conversion of the rest of the document.
        _log.error("Error processing MathML formula: %r", exc)
        return