pandoc-reference-filter Allow for formatting in captions

Currently, pandoc-reference-filter strips all formatting from captions. This is undesirable, but fixing it requires some significant changes to figure_replacement(), which has to treat LaTeX, HTML, and Markdown output differently.

I have a somewhat hackish proof-of-concept fix for this. It is built on top of (but mostly replaces) my work on adding support for short captions, which is in a pending pull request; it should be relatively easy to remove the short-caption code from this proof of concept if that were desirable. I've pasted the patch text below. (If you want, I can create a pull request, but I'm not sure this is ready. It's in the Format-Captions branch of my fork if you want to look there.)

Currently .docx captions aren't showing up at all (though the image is), for reasons I can't figure out. Any help with that would be appreciated. Also, the patch is not well tested, so expect bugs.

The question is: is this wanted? If so, is this the right approach, or does anyone have better ideas?

 internalreferences.py | 165 ++++++++++++++++++++++++++++----------------------
 1 file changed, 94 insertions(+), 71 deletions(-)

diff --git a/internalreferences.py b/internalreferences.py
index 5e85073..63f3a3f 100755
--- a/internalreferences.py
+++ b/internalreferences.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import re
 from collections import OrderedDict
+from subprocess import Popen, PIPE

 import pandocfilters as pf

@@ -127,6 +128,76 @@ def create_figures(key, value, format, metadata):
     else:
         return None

+def toFormat(string, format):
+    # Process string through pandoc to get formatted string. Is there a better way?
+    p1 = Popen(['echo'] + string.split(), stdout=PIPE)
+    p2 = Popen(['pandoc', '-t', format], stdin=p1.stdout, stdout=PIPE)
+    p1.stdout.close()
+    return p2.communicate()[0].strip('\n')
+
+def latex_figure(attr, filename, caption, alt):
+    beginText = (u'\n'
+               '\\begin{{figure}}[htbp]\n'
+               '\\centering\n'
+               '\\includegraphics{{{filename}}}\n'.format(
+                                           filename=filename
+                                           ).encode('utf-8'))
+    endText = (u'}}\n'
+               '\\label{{{attr.id}}}\n'
+               '\\end{{figure}}\n'.format(attr=attr))
+    star = False
+    if 'unnumbered' in attr.classes:
+        beginText += '\\caption*{'
+        star = True
+    if alt and not star:
+        shortCaption = toFormat(alt, 'latex')
+        beginText += '\\caption['
+        latexFigure = [RawInline('latex', beginText)]
+        latexFigure += [RawInline('latex', shortCaption + ']{')] 
+    
+    else: # No short caption
+        if star: beginText += '\\caption*{'
+        else: beginText += '\\caption{'
+        latexFigure = [RawInline('latex', beginText + '{')]
+
+    latexFigure += caption
+    latexFigure += [RawInline('latex', endText)]
+    return pf.Para(latexFigure)
+
+def html_figure(attr, filename, fcaption, alt):
+    beginText = (u'\n'
+                  '<div {attr.html}>\n'
+                  '<img src="{filename}" alt="{alt}" />\n'
+                  '<p class="caption">').format(attr=attr,
+                                                filename=filename,
+                                                alt=alt)
+    endText = (u'</p>\n'
+                '</div>\n')
+    htmlFigure = [RawInline('html', beginText)]
+    htmlFigure += fcaption
+    htmlFigure += [RawInline('html', endText)]
+    return pf.Plain(htmlFigure)
+
+def html5_figure(attr, filename, fcaption, alt):
+    beginText = (u'\n'
+                   '<figure {attr.html}>\n'
+                   '<img src="{filename}" alt="{alt}" />\n'
+                   '<figcaption>').format(attr=attr,
+                                          filename=filename,
+                                          alt=alt)
+    endText = u'</figcaption>\n</figure>\n'
+    htmlFigure = [RawInline('html5', beginText)]
+    htmlFigure += fcaption
+    htmlFigure += [RawInline('html5', endText)]
+    return pf.Plain(htmlFigure)
+
+def markdown_figure(attr, filename, fcaption, alt):
+    beginText = u'<div {attr.html}>'.format(attr=attr)
+    endText = u'</div>'
+    markdownFigure = [pf.Para([pf.RawInline('html', beginText)])]
+    markdownFigure += [pf.Para([pf.Image(fcaption, (filename,alt))])]
+    markdownFigure += [pf.Para([pf.RawInline('html', endText)])]
+    return markdownFigure

 class ReferenceManager(object):
     """Internal reference manager.
@@ -139,32 +210,6 @@ class ReferenceManager(object):
     text of any given internal reference (no need for e.g. 'fig:' at
     the start of labels).
     """
-    figure_styles = {
-        'latex': (u'\n'
-                   '\\begin{{figure}}[htbp]\n'
-                   '\\centering\n'
-                   '\\includegraphics{{{filename}}}\n'
-                   '\\caption{star}{{{caption}}}\n'
-                   '\\label{{{attr.id}}}\n'
-                   '\\end{{figure}}\n'),
-
-        'html': (u'\n'
-                  '<div {attr.html}>\n'
-                  '<img src="{filename}" alt="{alt}" />'
-                  '<p class="caption">{fcaption}</p>\n'
-                  '</div>\n'),
-
-        'html5': (u'\n'
-                   '<figure {attr.html}>\n'
-                   '<img src="{filename}" alt="{alt}" />\n'
-                   '<figcaption>{fcaption}</figcaption>\n'
-                   '</figure>\n'),
-
-        'markdown': (u'\n'
-                      '<div {attr.html}>\n'
-                      '![{fcaption}]({filename})\n'
-                      '\n'
-                      '</div>\n')}

     latex_multi_autolink = u'\\cref{{{labels}}}{post}'

@@ -243,7 +288,7 @@ class ReferenceManager(object):
         """If the key, value represents a figure, append reference
         data to internal state.
         """
-        _caption, (filename, target), (id, classes, kvs) = value
+        _caption, (filename, alt), (id, classes, kvs) = value
         if 'unnumbered' in classes:
             return
         else:
@@ -278,7 +323,7 @@ class ReferenceManager(object):
         self.references[label] = {'type': 'math',
                                   'id': self.equation_count,
                                   'label': label}
-
+        
     def figure_replacement(self, key, value, format, metadata):
         """Replace figures with appropriate representation.

@@ -288,58 +333,36 @@ class ReferenceManager(object):
         The other way of doing it would be to pull out a '\label{(.*)}'
         from the caption of an Image and use that to update the references.
         """
-        _caption, (filename, target), attrs = value
-#        caption = pf.stringify(_caption)
-        caption = _caption
+        caption, (filename, alt), attrs = value
+        if format == 'latex': alt = toFormat(str(alt), format)  # Preserve formatting
+#        else: alt = pf.stringify(alt)

         attr = PandocAttributes(attrs)

         if 'unnumbered' in attr.classes:
-            star = '*'
             fcaption = caption
         else:
             ref = self.references[attr.id]
             star = ''
             if caption:
-                fcaption = u'Figure {n}: {caption}'.format(n=ref['id'],
-                                                           caption=caption)
+                fcaption = [pf.Str(u'Figure {n}: '.format(n=ref['id']))] + caption
             else:
-                fcaption = u'Figure {n}'.format(n=ref['id'])
+                fcaption = [pf.Str(u'Figure {n}'.format(n=ref['id']))]

         if 'figure' not in attr.classes:
             attr.classes.insert(0, 'figure')
-
-        if format in self.formats:
-#            figure = self.figure_styles[format].format(attr=attr,
-#                                                       filename=filename,
-#                                                       alt=fcaption,
-#                                                       fcaption=fcaption,
-#                                                       caption=caption,
-#                                                       star=star).encode('utf-8')
-
-#            return RawBlock(format, figure)
-            beginText = (u'\n'
-                   '\\begin{{figure}}[htbp]\n'
-                   '\\centering\n'
-                   '\\includegraphics{{{filename}}}\n'
-                   '\\caption{star}{{'.format(filename=filename,
-                                               star=star).encode('utf-8'))
-            endText = (u'}}\n'
-                   '\\label{{{attr.id}}}\n'
-                   '\\end{{figure}}\n'.format(attr=attr))
-            begin = RawBlock('latex', beginText)
-            end = RawBlock('latex', endText)
-            all = [begin, pf.Str('hello'), end]
-            return [begin] + [pf.Plain(caption)] + [end]
-            # Convert from: {"t":"Figure", "c":[[{"t":"Str","c":"CAPTION"}],["FIGURE.JPG","TITLE"],"{#REFERENCE}"]}
-            # to: {"t": "RawBlock", "c": }
-
+        
+        if format == 'latex': return latex_figure(attr, filename, caption, alt)
+        elif format == 'html': return html_figure(attr, filename, fcaption, alt)
+        elif format == 'html5': return html5_figure(attr, filename, fcaption, alt)
+        elif format == 'markdown': return markdown_figure(attr, filename, fcaption, alt)
         else:
-            alt = [pf.Str(fcaption)]
-            target = (filename, '')
-            image = pf.Image(alt, target)
-            figure = pf.Para([image])
-            return pf.Div(attr.to_pandoc(), [figure])
+#            # FIXME: docx export fails to include the caption!
+#            fcaption = pf.stringify(fcaption)
+#            fcaption = [pf.Str(str(caption))]
+            image = pf.Image(fcaption, [filename, ''])
+            return pf.Plain([image])
+#            return pf.Para([image])

     def section_replacement(self, key, value, format, metadata):
         """Replace sections with appropriate representation.
@@ -406,8 +429,9 @@ class ReferenceManager(object):
         else:
             citation = citations[0]

-        prefix = citation['citationPrefix']
+        prefix = citation['citationPrefix'] + [pf.Space()]
         suffix = citation['citationSuffix']
+        

         label = citation['citationId']

@@ -426,10 +450,9 @@ class ReferenceManager(object):
             link = pf.RawInline('latex', '\\ref{{{label}}}'.format(label=label))
             return prefix + [link] + suffix

-        else: # FIXME! -- This must be the HTML case.
-            link_text = '{}{}{}'.format(prefix, text, suffix)
-            link = pf.Link([pf.Str(link_text)], ('#' + label, ''))
-            return link
+        else:
+            link = pf.Link([pf.Str(text)], ('#' + label, ''))
+            return prefix + [link] + suffix

     def convert_multiref(self, key, value, format, metadata):
         """Convert all internal links from '#blah' into format
-- 
2.2.1

Aug 27 '15 01:08 bwhelm

@bwhelm By formatting, do you mean that the filter doesn't convert markdown captions into the target format (and only outputs them as plain text)? This is a limitation.

Can you submit this as a PR? It would make review easier.

Aug 27 '15 09:08 aaren

Submitted: https://github.com/aaren/pandoc-reference-filter/pull/17

Aug 27 '15 12:08 bwhelm