Docx-Viewer
Docx-Viewer copied to clipboard
Formatting set by style classes is missing (fix inside)
Here's the fix: you have to unzip both document.xml and styles.xml, then resolve each run's named style from styles.xml while tokenizing document.xml.
import JSZip from "jszip";
import { Parser } from "htmlparser2";
import * as tokens from "./docx-tokens";
/**
 * Converts a DOCX file into an array of block objects, each carrying a
 * paragraph format name and its formatted text tokens.
 *
 * Steps:
 *  1. Resolve the input to binary data and unzip it with JSZip.
 *  2. Parse word/styles.xml into a lookup of named styles (optional part).
 *  3. Tokenize word/document.xml, resolving run styles through that lookup.
 *
 * @param {string|File|Blob|ArrayBuffer|ArrayBufferView} docxInput - http(s) URL, File,
 *   Blob, ArrayBuffer, or typed array holding a DOCX file.
 * @param {object} [options]
 * @param {boolean} [options.simplified] - When true, every block is passed through
 *   tokens.simplifyTokens before being returned.
 * @returns {Promise<Array|undefined>} Array of blocks; undefined when docxInput is falsy.
 * @throws {Error} When the input type is unsupported, a string input is not an http(s)
 *   URL, the download fails, or word/document.xml is missing from the archive.
 */
export async function documentToTokens(docxInput, options = {}) {
  if (!docxInput) return;
  try {
    let zipData;
    if (typeof docxInput === "string") {
      // Only absolute http(s) URLs can be fetched; anything else is treated as a local path.
      if (!/^https?:\/\//i.test(docxInput)) {
        throw new Error("Local file paths are not supported in browser environments");
      }
      const response = await fetch(docxInput);
      // fetch() only rejects on network failure, so HTTP errors must be checked explicitly;
      // otherwise an error page would be handed to the unzipper.
      if (!response.ok) {
        throw new Error(
          `Failed to fetch DOCX file: ${response.status} ${response.statusText}`
        );
      }
      zipData = await response.arrayBuffer();
    } else if (typeof Blob !== "undefined" && docxInput instanceof Blob) {
      // File extends Blob, so this branch covers both without referencing the File
      // global, which is not defined in every runtime.
      zipData = await docxInput.arrayBuffer();
    } else if (docxInput instanceof ArrayBuffer || ArrayBuffer.isView(docxInput)) {
      zipData = docxInput;
    } else {
      throw new Error("Unsupported input type");
    }

    const zip = new JSZip();
    await zip.loadAsync(zipData);

    const docEntry = zip.file("word/document.xml");
    if (!docEntry) {
      throw new Error("Invalid DOCX file: word/document.xml is missing");
    }
    // Without a styles part there are simply no named styles to resolve.
    const styleEntry = zip.file("word/styles.xml");
    const [docXML, styleXML] = await Promise.all([
      docEntry.async("string"),
      styleEntry ? styleEntry.async("string") : "",
    ]);

    const styleData = await createStyleParser(styleXML);
    const blocks = await createTokenizer(docXML, styleData);
    if (options?.simplified) {
      // Wrap the call so map's index/array arguments are not forwarded.
      return blocks.map((block) => tokens.simplifyTokens(block));
    }
    return blocks;
  } catch (e) {
    console.error("Error processing DOCX file:", e);
    throw e;
  }
}
/**
 * Converts a DOCX input directly to markup.
 *  1. Tokenize the document via documentToTokens.
 *  2. Rebuild markup from the tokens via tokens.tokensToMarkup.
 *
 * @param {string|File|Blob|ArrayBuffer} filepath - Any input accepted by documentToTokens.
 * @returns {Promise<string>} Markup string; empty when documentToTokens yields no blocks.
 */
export async function documentToMarkup(filepath) {
  const docTokens = await documentToTokens(filepath);
  // documentToTokens resolves to undefined for falsy input; there is nothing to render.
  if (!docTokens) return "";
  return tokens.tokensToMarkup(docTokens);
}
/**
 * Parses the XML of word/styles.xml into a lookup of named styles.
 *
 * For every <w:style> with a w:styleId the result holds
 * { underline, strong, mark } booleans derived from the w:u, w:b and
 * w:highlight elements nested inside that style. Styles whose id contains
 * "highli" (case-insensitive) are always treated as highlighted.
 *
 * @param {string} styleXML - XML string of the styles part.
 * @returns {Promise<Object<string, {underline: boolean, strong: boolean, mark: boolean}>>}
 *   Map from style id to its formatting flags.
 */
export async function createStyleParser(styleXML) {
  // On/off attribute: a missing value means "on"; "0", "false" and "off" mean "off".
  const isToggleOn = (value) => !["0", "false", "off"].includes(value);

  const parsedStyles = {};
  // Flags object of the <w:style> currently being read, or null outside a style.
  let currentStyle = null;

  return new Promise((resolve, reject) => {
    const parser = new Parser(
      {
        onopentag(name, attributes) {
          if (name === "w:style") {
            const styleId = attributes["w:styleId"];
            if (!styleId) {
              // Without an id the style cannot be referenced by a run; skip it.
              currentStyle = null;
              return;
            }
            currentStyle = {
              underline: false,
              strong: false,
              // Name-based heuristic, applied once so it no longer shadows w:b below.
              mark: styleId.toLowerCase().includes("highli"),
            };
            parsedStyles[styleId] = currentStyle;
            return;
          }
          if (!currentStyle) return;

          if (name === "w:u") {
            currentStyle.underline = attributes["w:val"] !== "none";
          } else if (name === "w:highlight") {
            // Only ever switches the flag on, so the name heuristic is preserved.
            if (attributes["w:val"] !== "none") currentStyle.mark = true;
          } else if (name === "w:b") {
            currentStyle.strong = isToggleOn(attributes["w:val"]);
          }
        },
        onclosetag(name) {
          // Stop attributing tags to a style once it has been closed.
          if (name === "w:style") currentStyle = null;
        },
        onend: () => resolve(parsedStyles),
        onerror: reject,
      },
      { xmlMode: true }
    );
    parser.write(styleXML);
    parser.end();
  });
}
/**
 * Parses document XML into paragraph blocks of formatted text tokens.
 *
 * Each block is { format, tokens } where format is a paragraph style name and
 * tokens is a list of { text, format: { underline, strong, mark } } objects,
 * one per <w:r> run that contains text. Paragraphs without text runs are dropped.
 *
 * @param {string} docXML - XML string of word/document.xml.
 * @param {object} [styleData] - Map from style id to { underline, strong, mark },
 *   as produced by createStyleParser.
 * @returns {Promise<Array<{format: string, tokens: Array}>>} blocks[]
 */
export async function createTokenizer(docXML, styleData) {
  // On/off attribute: a missing value means "on"; "0", "false" and "off" mean "off".
  const isToggleOn = (value) => !["0", "false", "off"].includes(value);
  const plainFormat = () => ({ underline: false, strong: false, mark: false });

  const blocks = [];
  let block = null;
  // Run currently being read; null outside <w:r> so stray tags cannot touch a finished run.
  let token = null;

  return new Promise((resolve, reject) => {
    const parser = new Parser(
      {
        onopentag(name, attributes) {
          if (name === "w:p") {
            block = { format: "text", tokens: [] };
          } else if (name === "w:pStyle") {
            if (block) block.format = tokens.getStyleNameByXml(attributes["w:val"]);
          } else if (name === "w:outlineLvl") {
            // The XML value is zero-based; the style map lookup expects it one-based.
            if (block) block.format = tokens.getOutlineLvlName(+attributes["w:val"] + 1);
          } else if (name === "w:r") {
            token = { text: "", format: plainFormat() };
          } else if (token) {
            if (name === "w:rStyle") {
              // Layer the named style over the defaults so an unknown style id
              // still yields a complete set of boolean flags.
              token.format = {
                ...plainFormat(),
                ...styleData?.[attributes["w:val"]],
              };
            } else if (name === "w:u") {
              token.format.underline = attributes["w:val"] !== "none";
            } else if (name === "w:highlight") {
              token.format.mark = attributes["w:val"] !== "none";
            } else if (name === "w:b") {
              token.format.strong = isToggleOn(attributes["w:val"]);
            }
          }
        },
        ontext(data) {
          if (token) token.text += data;
        },
        onclosetag(name) {
          if (name === "w:p") {
            if (block && block.tokens.length) blocks.push(block);
          } else if (name === "w:r") {
            if (token && token.text && block) block.tokens.push(token);
            // Without this reset, paragraph-level run properties and text that
            // follow the run would mutate the token that was just pushed.
            token = null;
          }
        },
        onend: () => resolve(blocks),
        onerror: reject,
      },
      { xmlMode: true }
    );
    parser.write(docXML);
    parser.end();
  });
}
// NOTE(review): the definitions from here on appear to belong to the
// "./docx-tokens" module that the reader code imports as `tokens` — confirm.

/**
 * Finds the styleMap key whose xmlName equals the given XML style name.
 *
 * @param {string} elXmlName - Style name as it appears in the document XML.
 * @returns {string} Matching styleMap key, or "text" when nothing matches.
 */
export function getStyleNameByXml(elXmlName) {
  // Entries without an xmlName default to null so they only match a null lookup.
  const predicate = ({ xmlName = null }) => elXmlName === xmlName;
  return findKey(styleMap, predicate) ?? "text";
}
/**
 * Finds the styleMap key whose docxStyles.outlineLevel equals the given level.
 *
 * @param {number} outlineLvl - Outline level to look up.
 * @returns {string} Matching styleMap key, or "text" when nothing matches.
 */
export function getOutlineLvlName(outlineLvl) {
  const hasOutlineLevel = (entry) => {
    const { docxStyles = null } = entry;
    return docxStyles?.outlineLevel === outlineLvl;
  };
  const styleName = findKey(styleMap, hasOutlineLevel);
  return styleName ?? "text";
}
/**
 * Lists every style name defined in styleMap.
 *
 * @returns {string[]} The keys of styleMap, in definition order.
 */
export function getStyles() {
  return Object.entries(styleMap).map(([styleName]) => styleName);
}
/**
 * Lists the names of the styleMap entries flagged as headings.
 *
 * @returns {string[]} Keys of styleMap whose `heading` property is truthy.
 */
export function getHeadingStyles() {
  const headingNames = [];
  for (const [styleName, definition] of Object.entries(styleMap)) {
    if (definition.heading) {
      headingNames.push(styleName);
    }
  }
  return headingNames;
}
/**
 * Merges the docxStyles of every style named by a key of `styles`.
 *
 * Only the keys of `styles` are consulted; their values are not read. Keys
 * without a styleMap entry, or whose entry has no docxStyles, contribute nothing.
 * When several entries define the same option, the later key wins.
 *
 * @param {object} styles - Object whose keys are styleMap style names.
 * @returns {object} Combined docxStyles options.
 */
export function getDocxStyles(styles) {
  let combined = {};
  for (const styleName of Object.keys(styles)) {
    const docxOptions = styleMap[styleName]?.docxStyles;
    combined = { ...combined, ...docxOptions };
  }
  return combined;
}
/**
 * Rebuilds markup (or plain text) from tokenized blocks.
 *
 * Each block becomes one element named by styleMap[block.format].domElement
 * (falling back to the "text" entry for unknown formats). Inline tags for
 * underline / strong / mark are opened or closed whenever a token's flags
 * differ from the previous token's. Tokens with empty or whitespace-only text
 * are skipped.
 *
 * NOTE(review): inline tag state is carried across blocks, so a tag opened in
 * one block may be closed in a later one (or after the last block). That
 * placement is left as it was.
 *
 * @param {Array<{format: string, tokens: Array}>} textBlocks - Blocks to render.
 * @param {boolean} [plainTextOnly=false] - When true, emit token text only,
 *   with " \n" appended after each block and no tags or escaping.
 * @returns {string} Markup or plain text; "" when textBlocks is not an array.
 */
export function tokensToMarkup(textBlocks, plainTextOnly = false) {
  if (!Array.isArray(textBlocks)) return "";

  // Token text comes from document content, so it must not be able to inject markup.
  const htmlEntities = { "&": "&amp;", "<": "&lt;", ">": "&gt;" };
  const escapeHtml = (text) => text.replace(/[&<>]/g, (ch) => htmlEntities[ch]);

  let dom = "";
  const state = { underline: false, strong: false, mark: false };

  textBlocks.forEach(({ format, tokens }) => {
    if (!tokens?.length) return;
    const { domElement } = styleMap[format] ?? styleMap.text;
    if (!plainTextOnly) dom += ` <${domElement}>`;

    tokens.forEach(({ text, format: tokenFormat }) => {
      if (!text || text.trim().length < 1) return;
      let tags = "";
      for (const style in state) {
        // Coerce so a missing flag counts as "off" instead of emitting a stray closing tag.
        const wanted = Boolean(tokenFormat?.[style]);
        if (state[style] !== wanted) {
          const elName = styleMap[style]?.domElement;
          tags += wanted ? ` <${elName}>` : `</${elName}> `;
          state[style] = wanted;
        }
      }
      dom += plainTextOnly ? text : tags + escapeHtml(text);
    });

    if (plainTextOnly) dom += " \n";
    else dom += ` </${domElement}> `;
  });

  if (!plainTextOnly) {
    // Close whatever inline tags are still open after the last block.
    for (const style in state) {
      if (state[style]) dom += `</${styleMap[style]?.domElement}> `;
    }
  }

  // Neither pattern has the g flag, so only the first match of each is replaced.
  // Kept unchanged because existing callers may rely on the current output.
  dom = dom.replace(/ \n/, " ").replace(/\s+/, " ");
  return dom;
}
/**
 * Collapses adjacent tokens that share the same formatting into one token.
 *
 * The input block and its token objects are not modified; format objects are
 * reused by reference in the result.
 *
 * @param {{format: string, tokens: Array<{text: string, format: object}>}} block
 * @returns {{format: string, tokens: Array<{text: string, format: object}>}}
 *   New block with the same format and the merged token list.
 */
export function simplifyTokens(block) {
  const merged = [];
  block.tokens.forEach(({ format, text }) => {
    if (merged.length === 0) {
      merged.push({ format, text });
      return;
    }
    const last = merged[merged.length - 1];
    if (isSameFormat(format, last.format)) {
      // Same formatting as the previous token: extend its text.
      last.text = last.text + text;
    } else {
      merged.push({ text, format });
    }
  });
  return { format: block.format, tokens: merged };
}
/**
 * Tells whether two format objects agree on mark, strong and underline.
 *
 * @param {object} a - First format object.
 * @param {object} b - Second format object.
 * @returns {boolean} True when all three flags are strictly equal.
 */
function isSameFormat(a, b) {
  const comparedFlags = ["mark", "strong", "underline"];
  return comparedFlags.every((flag) => a[flag] === b[flag]);
}
/**
 * Returns the first own enumerable key of `object` whose value satisfies `predicate`.
 *
 * @param {object|null|undefined} object - Object to search.
 * @param {function(*, string, object): *} predicate - Called with (value, key, object).
 * @returns {string|undefined} The first matching key; undefined when `object`
 *   is null/undefined or nothing matches.
 */
function findKey(object, predicate) {
  if (object === null || object === undefined) {
    return undefined;
  }
  return Object.keys(object).find((key) => predicate(object[key], key, object));
}
/**
 * Table of the supported styles, keyed by style name.
 *
 * Every entry has `block`, `heading`, `domSelector` and `domElement`.
 * Heading entries also carry the XML style name (`xmlName`) and a
 * `docxStyles` object with `heading` and `outlineLevel`; inline entries carry
 * a `docxStyles` object with their own option.
 */
export const styleMap = (() => {
  // Heading levels 1-4 differ only by their number.
  const headingStyle = (level) => ({
    block: true,
    heading: true,
    domSelector: [`h${level}`],
    domElement: `h${level}`,
    xmlName: `Heading${level}`,
    docxStyles: {
      heading: level,
      outlineLevel: level,
    },
  });

  // Inline (non-block, non-heading) formatting styles.
  const inlineStyle = (domSelector, domElement, docxStyles) => ({
    block: false,
    heading: false,
    domSelector,
    domElement,
    docxStyles,
  });

  return {
    pocket: headingStyle(1),
    hat: headingStyle(2),
    block: headingStyle(3),
    tag: headingStyle(4),
    text: {
      block: true,
      heading: false,
      domSelector: ["p"],
      domElement: "p",
    },
    underline: inlineStyle(["span", "u"], "u", { underline: {} }),
    strong: inlineStyle(["strong"], "b", { bold: true }),
    mark: inlineStyle(["mark"], "mark", { highlight: "cyan" }),
  };
})();
Can you raise a PR with the relevant changes?
Thanks