dataframe_image
dataframe_image copied to clipboard
Create async backend for playwright
Hi, I'm using dataframe_image for a project. It was working fine, but last week it stopped working with the default backend (I'm pretty sure this is due to a Chromium update, because the same version of our project worked fine with an older version of Chromium and stopped working after the update).
To fix that, we have been testing the playwright backend, but since we use async functions, the playwright backend does not work as it is. Playwright has an async API which is required for executing it in async code.
I'm not sure whether having another class for async backends would be useful to other users of dataframe_image, or whether it's possible to just add a parameter to the Playwright backend that allows dataframe_image (and playwright) to run inside async functions. But since I've already written the code for this, I want to share it with you.
This would be the code for the base class:
class AsyncBrowserConverter(ABC):
    """
    Abstract base for converters that render HTML to a PNG via an async browser API.

    Subclasses implement ``screenshot``; this class supplies the CSS/MathJax
    preamble, whitespace cropping and PNG (optionally base64) encoding.
    """

    # Stored as a class constant; not referenced by the methods of this class.
    MAX_IMAGE_SIZE = 65535

    def __init__(
        self,
        center_df: bool = True,
        max_rows: int | None = None,
        max_cols: int | None = None,
        chrome_path: str | None = None,
        fontsize: int = 18,
        encode_base64: bool = True,
        limit_crop: bool = True,
        device_scale_factor: float = 1,
        use_mathjax: bool = False,
    ):
        """
        Initialize the AsyncBrowserConverter class.

        Args:
            center_df (bool): Whether to center the dataframe. Default is True.
            max_rows (int): Maximum number of rows. Default is None.
            max_cols (int): Maximum number of columns. Default is None.
            chrome_path (str): Path to the Chrome executable. Default is None.
            fontsize (int): Font size. Default is 18.
            encode_base64 (bool): Whether to encode the image in base64.
                Default is True.
            limit_crop (bool): Whether to limit the crop. Default is True.
            device_scale_factor (float): Device scale factor. Default is 1.
            use_mathjax (bool): Whether to use MathJax for rendering.
                Default is False.
        """
        self.center_df = center_df
        self.max_rows = max_rows
        self.max_cols = max_cols
        self.chrome_path = chrome_path
        self.fontsize = fontsize
        self.encode_base64 = encode_base64
        self.limit_crop = limit_crop
        self.device_scale_factor = device_scale_factor
        self.use_mathjax = use_mathjax

    def get_css(self) -> str:
        """
        Get the CSS (and, when MathJax is enabled, the script tags) for the HTML.

        Returns:
            str: A ``<style>`` element built from the packaged ``style.css``
            template, followed by the MathJax ``<script>`` tags when
            ``use_mathjax`` is set.
        """
        from importlib.resources import files

        import dataframe_image.converter.browser.static as browser_static

        # Read through the Traversable API instead of the builtin open():
        # the resource is not guaranteed to be a real filesystem path, and an
        # explicit encoding avoids depending on the platform locale.
        template = (
            files(browser_static).joinpath("style.css").read_text(encoding="utf-8")
        )
        css = "<style>" + template + "</style>"
        justify = "center" if self.center_df else "left"
        css = css.format(fontsize=self.fontsize, justify=justify)
        if self.use_mathjax:
            # NOTE(review): two scripts below share the id "MathJax-script"
            # (tex-svg and tex-mml-chtml), and polyfill.io is a third-party CDN
            # that has been reported as compromised -- confirm and consider
            # trimming this preamble. Left unchanged here to preserve output.
            script = """<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']]
},
svg: {
fontCache: 'global'
}
};
</script>
<script type="text/javascript" id="MathJax-script" async
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-svg.js">
</script>
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>"""
            css += script
        return css

    def should_enlarge(
        self, img: Image, ss_width: int, ss_height: int
    ) -> tuple[bool, int, int]:
        """
        Check if the image should be enlarged.

        The screenshot is considered large enough along an axis only when its
        last 30 columns (or rows) are entirely white.

        Args:
            img (Image): The image to check.
            ss_width (int): The screenshot width.
            ss_height (int): The screenshot height.

        Returns:
            tuple: A tuple containing a boolean indicating whether to enlarge
            the image, and the (possibly 1.5x larger) width and height.
        """
        enlarge = False
        im_ndarray = np.array(img)
        # A pixel counts as white when the mean over its channels is 255.
        img2d = im_ndarray.mean(axis=2) == 255

        all_white_vert = img2d.all(axis=0)
        # must be all white for 30 pixels in a row to trigger stop
        if all_white_vert[-30:].sum() != 30:
            ss_width = int(ss_width * 1.5)
            enlarge = True

        all_white_horiz = img2d.all(axis=1)
        if all_white_horiz[-30:].sum() != 30:
            ss_height = int(ss_height * 1.5)
            enlarge = True

        return enlarge, ss_width, ss_height

    @abstractmethod
    async def screenshot(
        self, html: str, ss_width: int = 1920, ss_height: int = 1080
    ) -> Image:
        """
        Take a screenshot of the HTML.

        Args:
            html (str): The HTML to screenshot.
            ss_width (int): The screenshot width. Default is 1920.
            ss_height (int): The screenshot height. Default is 1080.

        Returns:
            Image: The screenshot image.
        """
        raise NotImplementedError

    def crop(self, im: Image) -> Image:
        """
        Crop the image down to its non-black, then non-white, bounding box.

        Args:
            im (Image): The image to crop.

        Returns:
            Image: The cropped image.
        """
        # remove black
        imrgb = im.convert("RGB")
        imageBox = imrgb.getbbox()
        im = im.crop(imageBox)

        # remove alpha channel
        imrgb = im.convert("RGB")
        # invert image (so that white is 0)
        invert_im = ImageOps.invert(imrgb)
        imageBox = invert_im.getbbox()
        cropped = im.crop(imageBox)
        return cropped

    async def run(self, html: str) -> bytes | str:
        """
        Run the converter on the HTML: screenshot, crop, then encode.

        Args:
            html (str): The HTML to convert.

        Returns:
            bytes | str: The PNG bytes, or a base64 string when
            ``encode_base64`` is set.
        """
        im = await self.screenshot(html)
        temp_img = self.crop(im)
        image_bytes = self.finalize_image(temp_img)
        return image_bytes

    def finalize_image(self, img: Image) -> bytes | str:
        """
        Finalize the image by serializing it as PNG.

        Args:
            img (Image): The image to finalize.

        Returns:
            bytes | str: The raw PNG bytes, or their base64 text when
            ``encode_base64`` is set.
        """
        buffer = io.BytesIO()
        img.save(buffer, format="png")
        img_str = buffer.getvalue()
        if self.encode_base64:
            img_str = base64.b64encode(img_str).decode()
        return img_str

    def repr_png_wrapper(self):
        """
        Build a ``_repr_png_`` function bound to this converter.

        Returns:
            A function taking a DataFrame or Styler as ``self`` that renders
            it to HTML and hands the HTML to ``run``.
        """
        from pandas.io.formats.style import Styler

        ss = self

        def _repr_png_(self):
            if isinstance(self, Styler):
                html = styler2html(self)
            else:
                html = self.to_html(
                    max_rows=ss.max_rows, max_cols=ss.max_cols, notebook=True
                )
            # NOTE(review): `run` is a coroutine function, so this returns an
            # un-awaited awaitable rather than image data; the caller has to
            # await it. Confirm whether a synchronous display hook can use it.
            return ss.run(html)

        return _repr_png_
This is the code for the Playwright async backend:
class AsyncPlayWrightConverter(AsyncBrowserConverter):
    """Converter that renders HTML with Playwright's async Chromium API."""

    async def screenshot(
        self, html: str, ss_width: int = 1920, ss_height: int = 1080
    ) -> Image:
        """
        Render the HTML in a headless Chromium-based browser and capture it.

        Args:
            html (str): The HTML to screenshot.
            ss_width (int): Accepted for compatibility with the base-class
                signature; not used because a full-page screenshot is taken.
            ss_height (int): Accepted for compatibility with the base-class
                signature; not used because a full-page screenshot is taken.

        Returns:
            Image: The full-page screenshot.

        Raises:
            ImportError: If Playwright is not installed.
            playwright.async_api.Error: If no Chromium-based browser could be
                launched.
        """
        try:
            from playwright.async_api import Error, async_playwright
        except ImportError as ex:
            raise ImportError(
                "Playwright is not installed. Install it with 'pip install playwright' "
                "and make sure you have a chromium browser installed."
            ) from ex

        async with async_playwright() as p:
            # Try each channel in turn; the first one that launches wins.
            channels = ["chromium", "chrome", "msedge", None]
            for c in channels:
                try:
                    browser = await p.chromium.launch(
                        channel=c, args=["--disable-web-security"]
                    )
                    break
                except Error:
                    continue
            else:
                raise Error(
                    "Could not find any chromium based browser. Make sure you have a "
                    "chromium browser installed. Or install it by "
                    "`playwright install chromium`."
                )

            try:
                context = await browser.new_context(
                    device_scale_factor=self.device_scale_factor, bypass_csp=True
                )
                page = await context.new_page()
                await page.set_content(self.get_css() + html)
                if self.use_mathjax:
                    mj = page.locator("mjx-container math")
                    try:
                        # In the async API these calls return coroutines; without
                        # `await` they never run and the wait is skipped.
                        await mj.wait_for(timeout=10000)
                    except Error:
                        logger.warning(
                            "MathJax did not render in time. Formula in dataframe may not "
                            "be rendered correctly."
                        )
                    # Short settle delay after MathJax output appears.
                    # NOTE(review): assumed to belong to the MathJax branch --
                    # the original indentation was lost; confirm.
                    await page.wait_for_timeout(200)
                screenshot_bytes = await page.screenshot(full_page=True)
            finally:
                # Release the browser even if rendering fails.
                await browser.close()

        im = Image.open(io.BytesIO(screenshot_bytes))
        return im
There are some minor changes (format and other things) to make it work in our project, but it could be used as a reference to implement this. Right now I cannot implement this and create a PR due to lack of time, but maybe in a few weeks I can do it if you think it's interesting to have it.
Thanks for sharing your idea. Can you provide a small example of using dataframe_image with AsyncPlayWrightConverter?
And what is the use case for the async interface? Is it meant to be used in an async web server, or can it significantly increase performance?
To use it, I have created a function (it's basically a copy of the dfi.export function):
async def dfi_export_async_playright(
    obj: pd.DataFrame,
    filename,
    fontsize=14,
    max_rows=None,
    max_cols=None,
    chrome_path=None,
    dpi=None,
    use_mathjax=False,
):
    """
    Export a DataFrame or Styler to a PNG using the async Playwright converter.

    Args:
        obj: The DataFrame, or a Styler wrapping one, to render.
        filename: Destination path, or a writable binary file-like object.
        fontsize: Font size passed to the converter. Default is 14.
        max_rows: Maximum rows to render; -1 means all rows. Required when the
            frame has more than ``MAX_ROWS`` rows.
        max_cols: Maximum columns to render; -1 means all columns. Required
            when the frame has more than ``MAX_COLS`` columns.
        chrome_path: Passed through to the converter. Default is None.
        dpi: When given, the device scale factor is ``dpi / 100.0``;
            otherwise 1.
        use_mathjax: Whether to load MathJax in the rendered page.

    Raises:
        ValueError: If the frame exceeds ``MAX_ROWS`` / ``MAX_COLS`` and the
            matching limit was not set explicitly.
        TypeError: If ``filename`` can be neither opened nor written to.
    """
    is_styler = isinstance(obj, Styler)
    df = obj.data if is_styler else obj

    converter = AsyncPlayWrightConverter(
        max_rows=max_rows,
        max_cols=max_cols,
        chrome_path=chrome_path,
        fontsize=fontsize,
        encode_base64=False,
        limit_crop=False,
        device_scale_factor=(1 if dpi is None else dpi / 100.0),
        use_mathjax=use_mathjax,
    ).run

    if df.shape[0] > MAX_ROWS and max_rows is None:
        error_msg = (
            f"Your DataFrame has more than {MAX_ROWS} rows and will produce a huge "
            "image file, possibly causing your computer to crash. Override this error "
            "by explicitly setting `max_rows`. Use -1 for all rows."
        )
        if is_styler:
            # Message fixed: this branch is about rows, not columns.
            error_msg = (
                f"Your Styled DataFrame has more than {MAX_ROWS} rows and will produce "
                "a huge image file, possibly causing your computer to crash. Override "
                "this error by explicitly setting `max_rows` to -1 for all rows. "
                "Styled DataFrames are unable to select a subset of rows or columns "
                "and therefore do not work with the `max_rows` and `max_cols` params"
            )
        raise ValueError(error_msg)

    if df.shape[1] > MAX_COLS and max_cols is None:
        error_msg = (
            f"Your DataFrame has more than {MAX_COLS} columns and will produce a huge "
            "image file, possibly causing your computer to crash. Override this error "
            "by explicitly setting `max_cols`. Use -1 for all columns."
        )
        if is_styler:
            error_msg = (
                f"Your Styled DataFrame has more than {MAX_COLS} columns and will "
                "produce a huge image file, possibly causing your computer to crash. "
                "Override this error by explicitly setting `max_cols` to -1 for "
                "all columns. Styled DataFrames are unable to select a subset of "
                "rows or columns and therefore do not work with the `max_rows` "
                "and `max_cols` parameters"
            )
        raise ValueError(error_msg)

    # -1 is the caller-facing "no limit" marker; to_html expects None for that.
    if max_rows == -1:
        max_rows = None
    if max_cols == -1:
        max_cols = None

    if is_styler:
        html = styler2html(obj)
    else:
        html = obj.to_html(max_rows=max_rows, max_cols=max_cols, notebook=True)

    # Lift Pillow's pixel limit while rendering, and always swap back to the
    # original value, even when the conversion raises.
    # NOTE(review): this is process-global state; exports running concurrently
    # (e.g. under asyncio.gather) can observe or restore each other's value.
    pre_limit = Image.MAX_IMAGE_PIXELS
    Image.MAX_IMAGE_PIXELS = None
    try:
        img_str = await converter(html)
    finally:
        Image.MAX_IMAGE_PIXELS = pre_limit

    try:
        with open(filename, "wb") as f:
            f.write(img_str)
    except TypeError:
        # `filename` may be an already-open file-like object instead of a path.
        if hasattr(filename, "write"):
            filename.write(img_str)
        else:
            raise
Then you could do something like this to run the tasks in parallel:
tasks = [dfi_export_async_playright(styled_df, filename) for styled_df, filename in my_config]
await asyncio.gather(*tasks)
In my case I didn't really need parallelism in the generation of images (it's already fast enough), but we had other processes (database queries, LLM calls, etc...) that we were executing in async functions, and playwright raised an error because you cannot call the playwright sync api from async functions (for other backends it worked well without having to do these things).
playwright raised an error because you cannot call the playwright sync api from async functions
I forgot about that limitation. I'll implement this asap.
Released in https://github.com/dexplo/dataframe_image/releases/tag/v0.2.5