examples
examples copied to clipboard
Extract ALL text content from the PDF.
trafficstars
I need to extract all the text content from a PDF as soon as it's loaded. I can't find the text in the onDocumentLoad props, and using renderPage with renderPageProps.textLayerRendered only gives the text content of the page currently being scrolled. I need ALL the text found in the PDF as soon as it is available. Thank you.
I'm going to answer my own question. I could not figure out how to do it using the library, so I implemented the following function using pdfjsLib:
/**
 * Collects the text of every page of a PDF into a single string.
 *
 * Opens the document with `pdfjsLib.getDocument`, requests each page from
 * 1 through `numPages`, and reads the `str` field of every item in that
 * page's text content. Items are concatenated with no separator, and the
 * per-page strings are then concatenated in page order, also with no
 * separator.
 *
 * @param {*} pdfUrl - Passed unchanged to `pdfjsLib.getDocument`.
 * @returns {Promise<string>} Resolves to the concatenated text of all pages.
 */
function extractText(pdfUrl) {
  const loadingTask = pdfjsLib.getDocument(pdfUrl);

  // Turns one page into the concatenation of its text items' `str` values.
  const readPageText = (pageProxy) =>
    pageProxy
      .getTextContent()
      .then((content) => content.items.map((item) => item.str).join(""));

  return loadingTask.promise.then((doc) => {
    // All pages are requested up front; Promise.all keeps the results in
    // page order regardless of which page finishes first.
    const pageTexts = Array.from({ length: doc.numPages }, (_, index) =>
      doc.getPage(index + 1).then(readPageText)
    );
    return Promise.all(pageTexts).then((texts) => texts.join(""));
  });
}
I then called it in onDocumentLoad. This worked, and I am now getting all the text content from the PDF.