docling icon indicating copy to clipboard operation
docling copied to clipboard

Mathjax tags in html_backend.py

Open C0NGTRI123 opened this issue 8 months ago • 2 comments

Question

I'm using html_content with MathJax to display mathematical formulas. However, when converting it to Markdown using the HTML_backend, the MathJax expressions aren't rendered correctly—some formulas written in LaTeX within MathJax tags disappear during the conversion.

<div>
<div>
<div>
<mjx-container><mjx-assistive-mml><math><mstyle><mrow><mo>[</mo><mtable><mtr><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><m
td><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><msqrt><mn>6</mn></msqrt><mn>6</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt
><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>6</mn></msqrt><mi>i</mi></mrow><mn>6</mn></mfrac></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd></mtr><mtr><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mn>0</mn></mtd><mtd><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mo>−</mo><mfrac><mrow><msqrt><mn>3</mn></msqrt><mi>i</mi></mrow><mn>3</mn></mfrac></mtd><mtd><mfrac><msqrt><mn>3</mn></msqrt><mn>3</mn></mfrac></mtd></mtr></mtable><mo>]</mo></mrow></mstyle></math></mjx-assistive-mml></mjx-container>
</div>
</div>
</div>

Result of the Markdown conversion using the HTML backend:


Here is the conversion I need: Image

Could you please update the mathjax_tags in the HTML_backend to ensure that LaTeX formulas within MathJax tags are preserved during Markdown conversion? Thank you

C0NGTRI123 avatar Apr 16 '25 08:04 C0NGTRI123

Let me look at this!

rickymaggio02 avatar May 21 '25 09:05 rickymaggio02

Let me look at this!

Here is my custom handler for mjx-container tags (handle_mathml_formula), which you can use as a reference. I was not able to optimize it and integrate it into your code as a merge request, but I hope it helps speed up extracting the content from MathJax tags:

from pylatexenc.latex2text import LatexNodes2Text

# Excerpt: `...` presumably stands in for the entries that are not shown here;
# "mjx-container" is the tag name this snippet lists explicitly.
TAGS_FOR_NODE_ITEMS: Final = [
    ..., 
    "mjx-container"
]

def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
      # Dispatches on the tag name; the `...` / `....` lines appear to elide
      # the other branches of this excerpt (assumption - confirm against the full file).
      if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(tag, doc)
      ...
      # "mjx-container" tags are routed to the MathML formula handler.
      elif tag.name in ["mjx-container"]:
            self.handle_mathml_formula(tag, doc)
     ....

def handle_mathml_formula(self, element: Tag, doc: DoclingDocument) -> None:
    """Convert the MathML of an ``mjx-container`` to LaTeX and add it to ``doc``.

    The first ``<math>`` descendant of ``element`` is converted, followed by
    the next siblings of ``element``: non-blank text nodes are appended
    (stripped), and sibling ``mjx-container`` / ``math`` tags are converted
    the same way. The concatenated LaTeX is passed through
    ``LatexNodes2Text().latex_to_text`` and stored as one ``TEXT`` item under
    ``self.parents[self.level]``.

    Failures are logged through ``_log`` and never raised: a formula that
    cannot be converted is skipped, and nothing is added to ``doc`` when no
    content remains.

    Args:
        element: The ``mjx-container`` tag to process.
        doc: Document that receives the resulting text item.
    """

    def local_name(node: Tag) -> str:
        """Return the tag name without a ``{namespace}`` or ``prefix:`` part."""
        return node.name.split('}')[-1].split(':')[-1]

    def element_children(node: Tag) -> list:
        """Return only the tag children of ``node``.

        Whitespace text nodes between tags (pretty-printed MathML) are not
        operands and must not be counted as such.
        """
        return [child for child in node.children if isinstance(child, Tag)]

    def convert_children(node: Tag) -> str:
        """Concatenate the LaTeX of every child of ``node``, in order."""
        return ''.join(convert_element(child) for child in node.children)

    def convert_pair(node: Tag, name: str) -> tuple:
        """Convert the two operands of a binary MathML element.

        Raises:
            ValueError: If ``node`` does not have exactly two tag children.
        """
        operands = element_children(node)
        if len(operands) != 2:
            raise ValueError(f"Error: <{name}> must have exactly two children")
        return convert_element(operands[0]), convert_element(operands[1])

    def row_cells(row: Tag) -> list:
        """Return the LaTeX of each ``mtd`` cell of a table row."""
        return [
            convert_children(cell)
            for cell in element_children(row)
            if local_name(cell) == 'mtd'
        ]

    def convert_element(node: PageElement) -> str:
        """Return the LaTeX for one MathML node.

        Raises:
            ValueError: If a binary element below ``node`` is malformed.
        """
        if isinstance(node, NavigableString):
            return node.strip()
        if not isinstance(node, Tag):
            return ''

        name = local_name(node)
        if name == 'msqrt':
            return f'\\sqrt{{{convert_children(node)}}}'
        if name == 'mfrac':
            numerator, denominator = convert_pair(node, name)
            return f'\\frac{{{numerator}}}{{{denominator}}}'
        if name == 'msub':
            base, subscript = convert_pair(node, name)
            return f'{{{base}}}_{{{subscript}}}'
        if name == 'msup':
            base, superscript = convert_pair(node, name)
            return f'{{{base}}}^{{{superscript}}}'
        if name == 'mtable':
            # Rows are assembled straight from the cell lists instead of being
            # serialised with a separator and split again, so a cell that
            # itself contains "|" (e.g. absolute-value bars) stays one cell.
            latex_rows = [
                ' & '.join(row_cells(row))
                for row in element_children(node)
                if local_name(row) == 'mtr'
            ]
            # Rows that produced no content at all are dropped.
            latex_rows = [row for row in latex_rows if row]
            # NOTE(review): the environment is always pmatrix, so a table that
            # is already fenced by <mo>[</mo> ... <mo>]</mo> ends up with two
            # sets of delimiters - confirm whether that is intended.
            return '\\begin{pmatrix} ' + ' \\\\ '.join(latex_rows) + ' \\end{pmatrix}'
        if name == 'mtr':
            # Only reached for a row outside of an <mtable>.
            return ' & '.join(row_cells(node))
        # mrow, mtd, mn, mi, mo, math and any unknown tag: plain concatenation.
        return convert_children(node)

    def convert_formula(math_tag: Tag, where: str) -> str:
        """Convert one ``<math>`` tag; log and return ``''`` on failure."""
        try:
            converted = convert_element(math_tag)
        except ValueError as exc:
            # A malformed element anywhere in the tree invalidates the whole
            # formula; it is skipped rather than emitted half-converted.
            _log.error("Failed to convert MathML to LaTeX in %s: %s", where, exc)
            return ''
        if not converted:
            _log.error("Failed to convert MathML to LaTeX in %s: %s", where, converted)
        return converted

    try:
        parts = []

        math_tag = element.find("math")
        if math_tag is not None:
            parts.append(convert_formula(math_tag, "main mjx-container"))

        # NOTE(review): the siblings consumed here are not marked as handled;
        # if the caller also visits them they may be emitted a second time -
        # confirm against the tree walker.
        for sibling in element.next_siblings:
            if isinstance(sibling, NavigableString):
                # Text between formulas (e.g. operators or connectors).
                parts.append(sibling.strip())
            elif isinstance(sibling, Tag):
                if sibling.name == "mjx-container":
                    math_tag = sibling.find("math")
                    if math_tag is not None:
                        parts.append(convert_formula(math_tag, "sibling mjx-container"))
                elif sibling.name == "math":
                    parts.append(convert_formula(sibling, "sibling math tag"))

        latex_content = ''.join(parts)
        if not latex_content:
            _log.error("Error: Empty MathML content after processing")
            return

        doc.add_text(
            parent=self.parents[self.level],
            label=DocItemLabel.TEXT,
            text=LatexNodes2Text().latex_to_text(latex_content),
            content_layer=self.content_layer,
        )

    except Exception as exc:
        # Best-effort boundary: a formula that cannot be processed must not
        # abort the conversion of the rest of the document.
        _log.error(f"Error processing MathML formula: {repr(exc)}")
        return

C0NGTRI123 avatar May 21 '25 10:05 C0NGTRI123