Docx-Viewer icon indicating copy to clipboard operation
Docx-Viewer copied to clipboard

Formatting set by style classes is missing (fix inside)

Open vtempest opened this issue 1 year ago • 1 comments

Here's the fix: you have to unzip both document.xml and styles.xml, then merge the formatting defined by the named styles in styles.xml into the runs of document.xml.

import JSZip from "jszip";
import { Parser } from "htmlparser2";

import * as tokens from "./docx-tokens";

/**
 * Converts a DOCX file into an array of block objects, each holding text
 * tokens with their formatting flags.
 *  1 - unzip the .docx container and read word/document.xml and word/styles.xml
 *  2 - parse the named styles, then tokenize document.xml against them
 *
 * @param {string|File|Blob|ArrayBuffer|Uint8Array} docxInput - http(s) URL,
 *   File, Blob, ArrayBuffer or Uint8Array/Buffer holding a DOCX file.
 *   Strings that do not start with "http" (local paths) are rejected.
 * @param {object} [options] - { simplified: boolean }; when `simplified` is
 *   truthy every block is passed through `tokens.simplifyTokens`.
 * @returns {Promise<Array|undefined>} Array of blocks with text and
 *   formatting, or `undefined` when `docxInput` is falsy.
 * @throws {Error} When the input type is unsupported, the download fails, or
 *   word/document.xml is missing. Every failure is logged with
 *   `console.error` and then rethrown.
 */
export async function documentToTokens(docxInput, options = {}) {
  if (!docxInput) return;

  try {
    let arrayBuffer;

    if (typeof docxInput === "string") {
      // Only URLs can be loaded; a plain path cannot be read from a browser.
      if (!docxInput.startsWith("http")) {
        throw new Error("Local file paths are not supported in browser environments");
      }
      const response = await fetch(docxInput);
      // Without this check an HTTP error page would be handed to the unzipper.
      if (!response.ok) {
        throw new Error(`Failed to download DOCX file: HTTP ${response.status}`);
      }
      arrayBuffer = await response.arrayBuffer();
    } else if (typeof Blob !== "undefined" && docxInput instanceof Blob) {
      // File inherits from Blob, so one check covers both and avoids a
      // ReferenceError on runtimes that define Blob but no global File.
      arrayBuffer = await docxInput.arrayBuffer();
    } else if (docxInput instanceof ArrayBuffer || docxInput instanceof Uint8Array) {
      arrayBuffer = docxInput;
    } else {
      throw new Error("Unsupported input type");
    }

    const zip = new JSZip();
    await zip.loadAsync(arrayBuffer);

    // zip.file() yields null for a missing entry; fail with a clear message
    // instead of a TypeError on `.async`.
    const docEntry = zip.file("word/document.xml");
    if (!docEntry) {
      throw new Error("Invalid DOCX file: word/document.xml not found");
    }
    // A package without a styles part is tokenized with no named styles.
    const styleEntry = zip.file("word/styles.xml");

    const [docXML, styleXML] = await Promise.all([
      docEntry.async("string"),
      styleEntry ? styleEntry.async("string") : "",
    ]);

    const styleData = await createStyleParser(styleXML);
    const blocks = await createTokenizer(docXML, styleData);

    if (options.simplified) {
      return blocks.map(tokens.simplifyTokens);
    }
    return blocks;
  } catch (e) {
    console.error("Error processing DOCX file:", e);
    throw e;
  }
}

/**
 * Renders a DOCX input as cleaned markup.
 *  1 - tokenize the input with documentToTokens
 *  2 - hand the resulting blocks to tokens.tokensToMarkup
 * @param {*} filepath - forwarded unchanged to documentToTokens
 * @returns {Promise<*>} whatever tokens.tokensToMarkup returns for the blocks
 */
export async function documentToMarkup(filepath) {
  const parsedBlocks = await documentToTokens(filepath);
  const markup = tokens.tokensToMarkup(parsedBlocks);
  return markup;
}


/**
 * Parses word/styles.xml into a lookup of style id -> formatting flags.
 *
 * Every `w:style` element with a `w:styleId` gets an entry
 * `{ underline, strong, mark }`; the `w:u`, `w:b` and `w:highlight` tags that
 * follow it (until the next `w:style`) update that entry. A style whose id
 * contains "highli" (case-insensitive) is always treated as a highlight.
 *
 * @param {string} styleXML - contents of word/styles.xml
 * @returns {Promise<object>} map of style id to { underline, strong, mark }
 */
export async function createStyleParser(styleXML) {
  const parsedStyles = {};
  let styleName = "";

  // Toggle attributes may spell "off" as "0", "false" or "off"; a missing
  // value means the property is switched on.
  const isOn = (val) => val !== "0" && val !== "false" && val !== "off";

  return await new Promise((resolve, reject) => {
    const parser = new Parser(
      {
        onopentag(name, attributes) {
          if (name === "w:style") {
            // A style without an id cannot be referenced; skip it so its
            // children are not recorded under a bogus "undefined" key.
            styleName = attributes["w:styleId"] ?? "";
            if (!styleName) return;
            parsedStyles[styleName] = {
              underline: false,
              strong: false,
              // Name heuristic applied once here, so it can no longer
              // short-circuit the handling of w:b further down.
              mark: styleName.toLowerCase().includes("highli"),
            };
            return;
          }

          if (!styleName) return;
          const styles = parsedStyles[styleName];

          if (name === "w:u") {
            styles.underline = attributes["w:val"] !== "none";
          } else if (name === "w:highlight") {
            // "none" explicitly means no highlight colour.
            if (attributes["w:val"] !== "none") styles.mark = true;
          } else if (name === "w:b") {
            styles.strong = isOn(attributes["w:val"]);
          }
        },
        onend: () => resolve(parsedStyles),
        onerror: reject,
      },
      { xmlMode: true }
    );

    parser.write(styleXML);

    parser.end();
  });
}


/**
 * Parses document.xml and tokenizes each run of text into
 * { text: "", format: { underline, strong, mark } }, grouped per paragraph as
 * { format: <block style name>, tokens: [...] }.
 *
 * Paragraphs without any non-empty run are dropped. Run formatting starts
 * from the referenced `w:rStyle` (looked up in `styleData`) and is then
 * overridden by `w:u`, `w:b` and `w:highlight` tags inside the run.
 *
 * @param {string} docXML string from docx unzip
 * @param {object} styleData parsed object of style class names, as produced
 *   by createStyleParser
 * @returns {Promise<Array>} blocks[]
 */
export async function createTokenizer(docXML, styleData) {
  const blocks = [];
  let block;
  // Non-null only while the parser is inside a <w:r> element.
  let token = null;

  // Toggle attributes may spell "off" as "0", "false" or "off".
  const isOn = (val) => val !== "0" && val !== "false" && val !== "off";

  return await new Promise((resolve, reject) => {
    const parser = new Parser(
      {
        onopentag(name, attributes) {
          if (name === "w:p") {
            block = { format: "text", tokens: [] };
          } else if (name === "w:pStyle") {
            if (block) block.format = tokens.getStyleNameByXml(attributes["w:val"]);
          } else if (name === "w:outlineLvl") {
            if (block) block.format = tokens.getOutlineLvlName(+attributes["w:val"] + 1);
          } else if (name === "w:r") {
            token = {
              text: "",
              format: { underline: false, strong: false, mark: false },
            };
          } else if (token) {
            if (name === "w:rStyle") {
              // Layer the named style over the defaults so an unknown style
              // id still yields a format with all three boolean flags.
              token.format = {
                underline: false,
                strong: false,
                mark: false,
                ...styleData?.[attributes["w:val"]],
              };
            } else if (name === "w:u") {
              token.format.underline = attributes["w:val"] !== "none";
            } else if (name === "w:highlight") {
              token.format.mark = attributes["w:val"] !== "none";
            } else if (name === "w:b") {
              token.format.strong = isOn(attributes["w:val"]);
            }
          }
        },
        ontext(data) {
          if (token) token.text += data;
        },
        onclosetag(name) {
          if (name === "w:p") {
            if (block?.tokens.length) blocks.push(block);
          } else if (name === "w:r" && token) {
            if (token.text && block) block.tokens.push(token);
            // Leave the run: formatting tags or text that appear outside a
            // run (e.g. the paragraph-mark w:rPr of the next paragraph) must
            // not modify the run that was just emitted.
            token = null;
          }
        },
        onend: () => resolve(blocks),
        onerror: reject,
      },
      { xmlMode: true }
    );

    parser.write(docXML);

    parser.end();
  });
}

`

/**
 * Looks up the styleMap key whose `xmlName` equals the given Word style id.
 * @param {string} elXmlName - value of a `w:pStyle` `w:val` attribute
 * @returns {string} the matching styleMap key, or "text" when none matches
 */
export function getStyleNameByXml(elXmlName) {
  // Entries without an xmlName default to null so they only match a null id.
  const predicate = ({ xmlName = null }) => elXmlName === xmlName;
  return findKey(styleMap, predicate) ?? "text";
}

/**
 * Finds the styleMap key whose `docxStyles.outlineLevel` equals the given level.
 * @param {number} outlineLvl - outline level to look for
 * @returns {string} the matching styleMap key, or "text" when none matches
 */
export function getOutlineLvlName(outlineLvl) {
  const matchedKey = findKey(
    styleMap,
    (entry) => entry.docxStyles?.outlineLevel === outlineLvl
  );
  if (matchedKey === undefined || matchedKey === null) {
    return "text";
  }
  return matchedKey;
}

/**
 * Lists every style name defined in styleMap, in definition order.
 * @returns {string[]} the keys of styleMap
 */
export function getStyles() {
  const names = [];
  for (const name of Object.keys(styleMap)) {
    names.push(name);
  }
  return names;
}

/**
 * Lists the styleMap keys whose entry has a truthy `heading` flag.
 * @returns {string[]} heading style names, in definition order
 */
export function getHeadingStyles() {
  const headingNames = [];
  for (const name of Object.keys(styleMap)) {
    if (styleMap[name].heading) {
      headingNames.push(name);
    }
  }
  return headingNames;
}

/**
 * Merges the `docxStyles` of every styleMap entry named by a key of `styles`.
 * Keys with no styleMap entry (or no docxStyles) contribute nothing; when two
 * entries define the same property, the later key wins.
 * @param {object} styles - object whose keys are style names (values are not read)
 * @returns {object} a new object with the combined docx style properties
 */
export function getDocxStyles(styles) {
  const combined = {};
  for (const styleName of Object.keys(styles)) {
    Object.assign(combined, styleMap[styleName]?.docxStyles);
  }
  return combined;
}

/**
 * Rebuilds markup (or plain text) from tokenized blocks.
 *
 * Each block with tokens is wrapped in the `domElement` of its styleMap
 * entry; inline underline/strong/mark tags are opened and closed whenever a
 * token's flag differs from the currently open state. The inline state is
 * carried across blocks and any tag still open at the end is closed.
 *
 * @param {Array} textBlocks - blocks of { format, tokens: [{ text, format }] }
 * @param {boolean} [plainTextOnly=false] - emit token text only, with each
 *   block followed by " \n", and no tags
 * @returns {string} the generated markup or plain text
 */
export function tokensToMarkup(textBlocks, plainTextOnly = false) {
  let dom = "";
  const state = { underline: false, strong: false, mark: false };

  textBlocks.forEach(({ format, tokens }) => {
    if (!tokens.length) return;
    // Unknown block formats fall back to the plain "text" element rather
    // than crashing on the destructuring.
    const { domElement } = styleMap[format] ?? styleMap.text;

    if (!plainTextOnly) dom += ` <${domElement}>`;

    tokens.forEach(({ text, format: tokenFormat }) => {
      if (!text || text.trim().length < 1) return;
      let tags = "";
      for (const style in state) {
        // Coerce to boolean: a missing flag (e.g. format `{}`) means "off".
        // Comparing `false !== undefined` used to emit a stray closing tag
        // for every such token.
        const wanted = Boolean(tokenFormat?.[style]);
        if (state[style] !== wanted) {
          const elName = styleMap[style]?.domElement;
          tags += wanted ? ` <${elName}>` : `</${elName}> `;
          state[style] = wanted;
        }
      }

      // NOTE(review): token text is inserted without HTML escaping; confirm
      // whether callers pass already-escaped text before rendering this.
      dom += plainTextOnly ? text : tags + text;
    });

    dom += plainTextOnly ? " \n" : ` </${domElement}> `;
  });

  if (!plainTextOnly) {
    for (const style in state) {
      if (state[style]) dom += `</${styleMap[style]?.domElement}> `;
    }
  }

  // NOTE(review): neither pattern is global, so only the first " \n" and the
  // first whitespace run are normalised. Kept as-is because callers may rely
  // on the remaining line breaks - confirm the intent.
  dom = dom.replace(/ \n/, " ").replace(/\s+/, " ");

  return dom;
}

/**
 * Collapses consecutive tokens of a block that share the same formatting
 * into a single token whose text is the concatenation of the originals.
 * The input block and its token objects are left untouched.
 * @param {object} block - { format, tokens: [{ format, text }] }
 * @returns {object} a new block { format, tokens } with merged tokens
 */
export function simplifyTokens(block) {
  const merged = [];

  block.tokens.forEach(({ format, text }) => {
    if (merged.length === 0) {
      merged.push({ format, text });
      return;
    }

    const last = merged[merged.length - 1];
    if (isSameFormat(format, last.format)) {
      // Same formatting as the previous token: extend its text.
      last.text = last.text + text;
    } else {
      merged.push({ text, format });
    }
  });

  return { format: block.format, tokens: merged };
}

/**
 * Tells whether two token formats agree on the mark, strong and underline flags.
 * @param {object} a - first format
 * @param {object} b - second format
 * @returns {boolean} true when all three flags are strictly equal
 */
function isSameFormat(a, b) {
  const comparedFlags = ["mark", "strong", "underline"];
  return comparedFlags.every((flag) => a[flag] === b[flag]);
}

/**
 * Returns the first own enumerable key of `object` whose value satisfies
 * `predicate`, which is called as predicate(value, key, object).
 * @param {object|null|undefined} object - object to search
 * @param {Function} predicate - test applied to each entry in key order
 * @returns {string|undefined} the first matching key, or undefined when
 *   nothing matches or `object` is null/undefined
 */
function findKey(object, predicate) {
  if (object === null || object === undefined) {
    return undefined;
  }
  return Object.keys(object).find((candidate) =>
    predicate(object[candidate], candidate, object)
  );
}

/**
 * Table of the supported styles, keyed by internal style name.
 * Each entry records whether the style is block-level, whether it is a
 * heading, the DOM selectors/element it corresponds to and, where present,
 * the Word style id (`xmlName`) and docx style properties (`docxStyles`).
 */
export const styleMap = (() => {
  // Block-level heading: Word style "Heading<level>" rendered as <h<level>>.
  const headingEntry = (level) => ({
    block: true,
    heading: true,
    domSelector: [`h${level}`],
    domElement: `h${level}`,
    xmlName: `Heading${level}`,
    docxStyles: {
      heading: level,
      outlineLevel: level,
    },
  });

  // Inline (run-level) formatting entry.
  const inlineEntry = (domSelector, domElement, docxStyles) => ({
    block: false,
    heading: false,
    domSelector,
    domElement,
    docxStyles,
  });

  return {
    pocket: headingEntry(1),
    hat: headingEntry(2),
    block: headingEntry(3),
    tag: headingEntry(4),
    text: {
      block: true,
      heading: false,
      domSelector: ["p"],
      domElement: "p",
    },
    underline: inlineEntry(["span", "u"], "u", { underline: {} }),
    strong: inlineEntry(["strong"], "b", { bold: true }),
    mark: inlineEntry(["mark"], "mark", { highlight: "cyan" }),
  };
})();

vtempest avatar Sep 02 '24 19:09 vtempest

Can you raise a PR with the relevant changes?

Thanks

skfrost19 avatar Sep 03 '24 18:09 skfrost19