Cheerio Lib Timing Out During Fetch
Hi, Taniguchi Masaya sama,
As mentioned in my X message, I am having difficulty crawling using the cheerio gs library. The script continues to time out, despite my best efforts. Please see my code below. If there is any support you can lend, it would be greatly appreciated.
/**
 * Scrapes business names, phone numbers, e-mail addresses and websites from
 * paginated search results and writes them to columns C-F of the active sheet,
 * starting at row 7.
 *
 * The base URL is read from cell B7 and pages 1..maxPages are requested by
 * adding a `page` query parameter. Each page is attempted up to `retries`
 * times. The loop stops early once the run is within 30 seconds of the
 * five-minute budget so the script finishes on its own terms.
 *
 * Relies on extractMatches() and writeDataToSheet() defined in this file.
 */
function fetchAndExtractBusinessInfo() {
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  const url = String(sheet.getRange('B7').getValue());
  const maxPages = 10; // Adjust based on the expected number of pages
  const retries = 3; // Number of attempts for each page
  const startRow = 7; // First row that receives data
  const pauseMs = 2000; // Delay between consecutive page requests
  const startTime = new Date().getTime();
  const maxExecutionTime = 5 * 60 * 1000; // 5 minutes
  const safetyMargin = 30000; // Stop this long before the budget runs out

  // A URL without a query string needs '?' for its first parameter; blindly
  // appending '&page=' would produce a malformed address.
  const separator = url.indexOf('?') === -1 ? '?' : '&';

  // Each column keeps its own write cursor because a page can yield a
  // different number of matches per field.
  let nameRow = startRow;
  let numberRow = startRow;
  let emailRow = startRow;
  let websiteRow = startRow;

  for (let page = 1; page <= maxPages; page++) {
    if (new Date().getTime() - startTime > maxExecutionTime - safetyMargin) {
      Logger.log('Stopping before page ' + page + ': execution time budget nearly used up');
      break;
    }

    const paginatedUrl = url + separator + 'page=' + page;
    let success = false;

    for (let attempt = 0; attempt < retries; attempt++) {
      try {
        // `timeout` is not among the documented UrlFetchApp advanced
        // parameters, so only muteHttpExceptions is passed.
        const response = UrlFetchApp.fetch(paginatedUrl, { muteHttpExceptions: true });
        if (response.getResponseCode() !== 200) {
          throw new Error('Failed to fetch URL: ' + paginatedUrl + ' (status: ' + response.getResponseCode() + ')');
        }
        const html = response.getContentText();

        // Regular expressions to match the required data. They are created
        // fresh on every attempt so no `lastIndex` state is carried over.
        const nameRegex = /<div class="MuiTypography-root jss323 MuiTypography-h3 MuiTypography-displayBlock">([^<]*)<\/div>/g;
        const numberRegex = /<span class="MuiButton-label">([^<]*)<\/span>/g;
        const emailRegex = /<a[^>]*href="mailto:([^"]*)"/g;
        const websiteRegex = /<a class="MuiButtonBase-root MuiButton-root MuiButton-text ButtonWebsite MuiButton-textSecondary MuiButton-fullWidth" href="([^"]*)"/g;

        // Extract everything before writing anything, so a failure while
        // parsing cannot leave a half-written page behind.
        const businessNames = extractMatches(nameRegex, html);
        const businessNumbers = extractMatches(numberRegex, html);
        const businessEmails = extractMatches(emailRegex, html);
        const businessWebsites = extractMatches(websiteRegex, html);

        nameRow = writeDataToSheet(sheet, businessNames, 'C', nameRow);
        numberRow = writeDataToSheet(sheet, businessNumbers, 'D', numberRow);
        emailRow = writeDataToSheet(sheet, businessEmails, 'E', emailRow);
        websiteRow = writeDataToSheet(sheet, businessWebsites, 'F', websiteRow);

        success = true;
        break;
      } catch (e) {
        Logger.log('Attempt ' + (attempt + 1) + ' failed: ' + e.toString());
      }
    }

    if (!success) {
      Logger.log('Giving up on ' + paginatedUrl + ' after ' + retries + ' attempts');
    }

    // Space out requests to the remote site. The sleep itself counts toward
    // the execution budget, so it is skipped once the last page is done.
    if (page < maxPages) {
      Utilities.sleep(pauseMs);
    }
  }
}
// Function to extract matches using a regex pattern
/**
 * Collects the first capture group of every match of `regex` in `html`.
 *
 * The scan runs on a private global copy of the pattern, so it
 *  - terminates for a regex without the `g` flag (exec() would otherwise
 *    return the same first match forever),
 *  - terminates for patterns that can match the empty string, and
 *  - neither depends on nor changes the caller's `regex.lastIndex`.
 *
 * @param {RegExp} regex Pattern with at least one capture group.
 * @param {string} html Text to search.
 * @returns {Array} Capture-group-1 values in match order; an entry is
 *     `undefined` when the group did not participate in that match.
 */
function extractMatches(regex, html) {
  const flags = regex.flags.indexOf('g') === -1 ? regex.flags + 'g' : regex.flags;
  const scanner = new RegExp(regex.source, flags);
  const matches = [];
  let match;
  while ((match = scanner.exec(html)) !== null) {
    // exec() does not advance past an empty match; step forward manually.
    if (match[0] === '') {
      scanner.lastIndex++;
    }
    matches.push(match[1]);
  }
  return matches;
}
// Function to convert a column letter to its corresponding index
/**
 * Converts a spreadsheet column label to its 1-based column number.
 *
 * Handles labels of any length and either case: 'A' -> 1, 'Z' -> 26,
 * 'AA' -> 27, 'c' -> 3. Surrounding whitespace is ignored.
 *
 * @param {string} column Column label made up of the letters A-Z.
 * @returns {number} 1-based column index.
 * @throws {Error} If `column` contains anything other than letters.
 */
function columnLetterToIndex(column) {
  const letters = String(column).trim().toUpperCase();
  if (!/^[A-Z]+$/.test(letters)) {
    throw new Error('Invalid column letter: ' + column);
  }
  // Treat the label as a base-26 number whose digits run from A=1 to Z=26.
  const base = 'A'.charCodeAt(0) - 1;
  let index = 0;
  for (let i = 0; i < letters.length; i++) {
    index = index * 26 + (letters.charCodeAt(i) - base);
  }
  return index;
}
// Function to write data to the sheet starting at the given cell and return the next available row
/**
 * Writes `data` down a single column of `sheet`, one value per row, and
 * returns the row that follows the last value written.
 *
 * All values are sent in one setValues() call instead of one setValue() call
 * per cell, which keeps the number of spreadsheet round trips constant no
 * matter how many values there are.
 *
 * @param {Object} sheet Sheet object exposing getRange(row, column, numRows, numColumns).
 * @param {Array} data Values to write, top to bottom.
 * @param {string} column Column label, converted with columnLetterToIndex().
 * @param {number} startRow 1-based row of the first value.
 * @returns {number} startRow + data.length, i.e. the next free row.
 */
function writeDataToSheet(sheet, data, column, startRow) {
  // Nothing to write; skip the range request, which would otherwise ask for
  // a range with zero rows.
  if (data.length === 0) {
    return startRow;
  }
  const columnIndex = columnLetterToIndex(column);
  // setValues() takes a two-dimensional array: one inner array per row.
  const rows = data.map(function (value) {
    return [value];
  });
  sheet.getRange(startRow, columnIndex, rows.length, 1).setValues(rows);
  return startRow + data.length;
}
Best regards, Samuel Shilson-Josling
Thank you for reporting the issue. I've read your code, but I could not find any code that uses my library. Would you mind telling me where you use Cheerio.load?
Hi Taniguchi Masaya sama,
Apologies for my inefficiency. I attached an older version of the script to the Github post I referred to earlier. Please find attached the correct version of the script below - including use of the cheerio gs library:
/**
 * Fetches the page whose URL is in cell B7, pulls the business name, phone
 * number, website and e-mail address out of it with Cheerio, and writes them
 * to C7, D7, E7 and F7 of the active sheet.
 *
 * A field whose element or attribute is absent from the page is written as an
 * empty string; previously a missing element made `.attr()` / `.data()`
 * return undefined and the chained string call aborted the whole run.
 *
 * @throws {Error} If the page does not answer with HTTP 200.
 */
function crawlBusinessInfo() {
  // Turns a possibly-missing attribute/data value into a trimmed string.
  const clean = (value) => (value == null ? '' : String(value).trim());

  // Get the URL from cell B7
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  const url = sheet.getRange('B7').getValue();

  // Fetch HTML content; HTTP errors are muted so the status can be reported
  // in a single, readable message.
  const response = UrlFetchApp.fetch(url, { muteHttpExceptions: true });
  const status = response.getResponseCode();
  if (status !== 200) {
    throw new Error('Failed to fetch URL: ' + url + ' (status: ' + status + ')');
  }

  // Parse HTML with Cheerio. load() is called on the Cheerio namespace rather
  // than through a detached reference.
  const $ = Cheerio.load(response.getContentText());

  // NOTE(review): `jss324` looks like a build-generated class name and may
  // change between site deployments — confirm against the live markup.
  const businessName = $(
    'div.MuiTypography-root.jss324.MuiTypography-h3.MuiTypography-displayBlock'
  ).text().trim();

  const phoneHref = $(
    'a.MuiButtonBase-root.MuiButton-root.MuiButton-text.ButtonPhone.wobble-call.MuiButton-textPrimary.MuiButton-fullWidth'
  ).attr('href');
  const phoneNumber = clean(phoneHref).replace('tel:', '').trim();

  const website = clean(
    $(
      'a.MuiButtonBase-root.MuiButton-root.MuiButton-text.ButtonWebsite.MuiButton-textSecondary.MuiButton-fullWidth'
    ).attr('href')
  );

  const emailAddress = clean(
    $('div.contact.contact-main.contact-email').data('email')
  );

  // Write data to cells
  sheet.getRange('C7').setValue(businessName);
  sheet.getRange('D7').setValue(phoneNumber);
  sheet.getRange('E7').setValue(website);
  sheet.getRange('F7').setValue(emailAddress);
}
Best regards, Samuel Shilson-Josling
On Mon, Aug 19, 2024 at 12:14 PM TANIGUCHI Masaya @.***> wrote:
Thank you for reporting issue. I've read your code but I could not find any code regarding my library. Would you mind to tell me where you use Cheerio.load ?
— Reply to this email directly, view it on GitHub https://github.com/tani/cheeriogs/issues/271#issuecomment-2295539877, or unsubscribe https://github.com/notifications/unsubscribe-auth/BKTYCDIGGEDL4ESI6NKXJFDZSFIJXAVCNFSM6AAAAABMW3JVX6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDEOJVGUZTSOBXG4 . You are receiving this because you authored the thread.Message ID: @.***>
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au
Thanks for sending me the new code. From what I've seen, the code looks fine; it's just odd that it times out. I'll test it here as well and report back in this thread as needed. Best regards.
Thank you, Taniguchi Masaya san. I look forward to hearing from you soon. If you can come up with a solution that would be great. Please let me know your findings.
Best regards,
On Mon, Aug 19, 2024 at 8:05 PM TANIGUCHI Masaya @.***> wrote:
Thanks for sending me the new code. From what I've seen, the code looks fine, it's just odd that it Timeout's. I'll test it here as well and report back in this thread as needed. Best regards.
— Reply to this email directly, view it on GitHub https://github.com/tani/cheeriogs/issues/271#issuecomment-2296180331, or unsubscribe https://github.com/notifications/unsubscribe-auth/BKTYCDM3H7Y5PLT7SNHB6UDZSG7OZAVCNFSM6AAAAABMW3JVX6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDEOJWGE4DAMZTGE . You are receiving this because you authored the thread.Message ID: @.***>
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au
Konnichiwa, Taniguchi Masaya,
I trust you are well and things are fine. I am just following up on my previous email to ask if you had any luck debugging my code yet? It would be great if I could do some research this weekend for my business.
Thank you for your continued support and encouragement.
Best regards, Samuel
On Tue, 20 Aug 2024, 7:16 am Samuel Shilson-Josling, @.***> wrote:
Thank you, Taniguchi Masaya san. I look forward to hearing from you soon. If you can come up with a solution that would be great. Please let me know your findings.
Best regards,
On Mon, Aug 19, 2024 at 8:05 PM TANIGUCHI Masaya @.***> wrote:
Thanks for sending me the new code. From what I've seen, the code looks fine, it's just odd that it Timeout's. I'll test it here as well and report back in this thread as needed. Best regards.
— Reply to this email directly, view it on GitHub https://github.com/tani/cheeriogs/issues/271#issuecomment-2296180331, or unsubscribe https://github.com/notifications/unsubscribe-auth/BKTYCDM3H7Y5PLT7SNHB6UDZSG7OZAVCNFSM6AAAAABMW3JVX6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDEOJWGE4DAMZTGE . You are receiving this because you authored the thread.Message ID: @.***>
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au
Ohayou gozaimasu, Taniguchi Masaya sama,
I trust you are fine and things are fine. I am just checking in on this thread to see if you have had any issues debugging my most recent code yet. I would appreciate hearing back from you.
Thank you for your continued support and encouragement. Do your best!
Best regards,
On Fri, Aug 23, 2024 at 1:38 PM Samuel Shilson-Josling @.***> wrote:
Konnichiwa, Taniguchi Masaya,
I trust you are well and things are fine. I am just following up on my previous email to ask if you had any luck debugging my code yet? It would be great if I could do some research this weekend for my business.
Thank you for your continued support and encouragement.
Best regards, Samuel
On Tue, 20 Aug 2024, 7:16 am Samuel Shilson-Josling, @.***> wrote:
Thank you, Taniguchi Masaya san. I look forward to hearing from you soon. If you can come up with a solution that would be great. Please let me know your findings.
Best regards,
On Mon, Aug 19, 2024 at 8:05 PM TANIGUCHI Masaya < @.***> wrote:
Thanks for sending me the new code. From what I've seen, the code looks fine, it's just odd that it Timeout's. I'll test it here as well and report back in this thread as needed. Best regards.
— Reply to this email directly, view it on GitHub https://github.com/tani/cheeriogs/issues/271#issuecomment-2296180331, or unsubscribe https://github.com/notifications/unsubscribe-auth/BKTYCDM3H7Y5PLT7SNHB6UDZSG7OZAVCNFSM6AAAAABMW3JVX6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDEOJWGE4DAMZTGE . You are receiving this because you authored the thread.Message ID: @.***>
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au
Hey, Taniguchi Masaya sama,
Can I please have some code? The D'Souza family at Sensis are blocking my crawl from augmenting my business with the efficiency metric of time.
As always, I look forward to hearing from you soon.
Thank you for your continued support, encouragement and understanding. Do your best!
Best regards,
On Wed, Aug 28, 2024 at 11:58 AM Samuel Shilson-Josling < @.***> wrote:
Ohaiyou gozaismasu, Taniguchi Masaya sama,
I trust you are fine and things are fine. I am just checking in on this thread to see if you have had any issues debugging my most-recent code yet. I would appreciate it: to hear back from you.
Thank you for your continued support and encouragement. Do your best!
Best regards,
On Fri, Aug 23, 2024 at 1:38 PM Samuel Shilson-Josling < @.***> wrote:
Konnichiwa, Taniguchi Masaya,
I trust you are well and things are fine. I am just following up on my previous email to ask if you had any luck debugging my code yet? It would be great if I could do some research this weekend for my business.
Thank you for your continued support and encouragement.
Best regards, Samuel
On Tue, 20 Aug 2024, 7:16 am Samuel Shilson-Josling, < @.***> wrote:
Thank you, Taniguchi Masaya san. I look forward to hearing from you soon. If you can come up with a solution that would be great. Please let me know your findings.
Best regards,
On Mon, Aug 19, 2024 at 8:05 PM TANIGUCHI Masaya < @.***> wrote:
Thanks for sending me the new code. From what I've seen, the code looks fine, it's just odd that it Timeout's. I'll test it here as well and report back in this thread as needed. Best regards.
— Reply to this email directly, view it on GitHub https://github.com/tani/cheeriogs/issues/271#issuecomment-2296180331, or unsubscribe https://github.com/notifications/unsubscribe-auth/BKTYCDM3H7Y5PLT7SNHB6UDZSG7OZAVCNFSM6AAAAABMW3JVX6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDEOJWGE4DAMZTGE . You are receiving this because you authored the thread.Message ID: @.***>
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au
--
Samuel Shilson-Josling Founder +61416 775 468 https://www.redseo.com.au