I’ve made a Node.js web scraper that runs fine on my computer. However, when I deploy it to my Google Cloud VM instance running Debian, it returns a timeout error for a specific website. I’ve tried many different setups for Puppeteer, but none of them seems to work. I believe the website I’m trying to scrape is blocking my code when I run it from the Google Cloud server, but not when I run it from my computer. The scraping part works fine on my computer: Puppeteer finds the HTML tags and retrieves the info.
const puppeteer = require('puppeteer');
const GoogleSpreadsheet = require('google-spreadsheet');
const { promisify } = require('util');
const credentials = require('./credentials.json');

async function main() {
  const scrapCopasa = await scrapCopasaFunction();
  console.log('Done!');
}

async function scrapCopasaFunction() {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox'],
  });
  const page = await browser.newPage();
  //await page.setDefaultNavigationTimeout(0);
  //await page.setViewport({width: 1366, height: 768});
  await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36');
  await page.goto('http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios');
  //await new Promise(resolve => setTimeout(resolve, 5000));

  let isUsernameNotFound = await page.evaluate(() => {
    if (document.getElementsByClassName('h2')[0]) {
      if (document.getElementsByTagName('h2')[0].textContent == "Sorry, this page isn't available.") {
        return true;
      }
    }
  });

  if (isUsernameNotFound) {
    console.log('Account not exists!');
    await browser.close();
    return;
  }

  let reservoirLevelsCopasa = await page.evaluate(() => {
    const tds = Array.from(document.querySelectorAll('table tr td'));
    return tds.map(td => td.innerText);
  });

  const riomanso = reservoirLevelsCopasa[13].replace(",", ".").substring(0, 5);
  const serraazul = reservoirLevelsCopasa[17].replace(",", ".").substring(0, 5);
  const vargemdasflores = reservoirLevelsCopasa[21].replace(",", ".").substring(0, 5);

  await browser.close();
  return [riomanso, serraazul, vargemdasflores];
}

main();
And the error that I’m getting is the following:
(node:6425) UnhandledPromiseRejectionWarning: TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
at /home/xxx/reservoirs/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
at async FrameManager.navigateFrame (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:94:17)
at async Frame.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:406:12)
at async Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:12)
at async scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:5)
at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
-- ASYNC --
at Frame.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:49)
at Page.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:112:23)
at scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:16)
at processTicksAndRejections (internal/process/task_queues.js:93:5)
at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
(Use `node --trace-warnings ...` to show where the warning was created)
(node:6425) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). To terminate the node process on unhandled promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode). (rejection id: 1)
(node:6425) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
Answers
Cloud functions are a bit slow for Puppeteer; there is a GitHub issue (#3120) regarding this. You can assign more CPU/RAM to the function, if that’s a possibility. The more CPU and RAM you provide for Chrome, the faster it will be.
You can pass a timeout option to goto, which is the maximum navigation time in milliseconds. It defaults to 30 seconds; pass 0 to disable the timeout. You can also set up the navigation timeout with setDefaultTimeout and setDefaultNavigationTimeout, where setDefaultNavigationTimeout takes priority over setDefaultTimeout.
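For example, applied to the question’s script (a minimal sketch; the 120000 ms value is only illustrative, and url stands in for the Copasa address used above):

// Option 1: per-navigation timeout passed directly to goto
await page.goto(url, { timeout: 120000 });

// Option 2: set defaults once, right after creating the page; pass 0 to disable the timeout entirely
page.setDefaultTimeout(120000);            // applies to all page waits
page.setDefaultNavigationTimeout(120000);  // navigations only; takes priority over setDefaultTimeout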
The data you’re extracting is already in the HTML, so you can fetch the HTML with a plain HTTP request and extract the data in the Node.js script instead of in the browser. This will be faster and require fewer resources. If you need to authenticate, you can send a POST request and reuse the cookie in the following GET request. Example in this answer.
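A minimal sketch of that approach, assuming axios as the HTTP client and cheerio as the HTML parser (both package choices are assumptions; neither appears in the original code):

const axios = require('axios');     // any HTTP client works; axios is just an example
const cheerio = require('cheerio'); // parses the HTML in Node.js instead of a real browser

async function fetchReservoirLevels() {
  const url = 'http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios';
  const { data: html } = await axios.get(url, { timeout: 30000 });

  // Read the same table cells the Puppeteer version selects with querySelectorAll('table tr td')
  const $ = cheerio.load(html);
  const tds = $('table tr td').map((i, td) => $(td).text()).get();

  const riomanso = tds[13].replace(',', '.').substring(0, 5);
  const serraazul = tds[17].replace(',', '.').substring(0, 5);
  const vargemdasflores = tds[21].replace(',', '.').substring(0, 5);
  return [riomanso, serraazul, vargemdasflores];
}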
Full example
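A sketch of the question’s script with the longer navigation timeout and basic error handling applied (the 120000 ms value, the try/finally, and the final .catch are additions for illustration, not part of the original):

const puppeteer = require('puppeteer');

async function scrapCopasaFunction() {
  const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
  try {
    const page = await browser.newPage();
    // Give slow environments more room than the 30 s default (value is illustrative)
    page.setDefaultNavigationTimeout(120000);
    await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36');
    await page.goto('http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios');

    const tds = await page.evaluate(() =>
      Array.from(document.querySelectorAll('table tr td')).map(td => td.innerText)
    );

    const riomanso = tds[13].replace(',', '.').substring(0, 5);
    const serraazul = tds[17].replace(',', '.').substring(0, 5);
    const vargemdasflores = tds[21].replace(',', '.').substring(0, 5);
    return [riomanso, serraazul, vargemdasflores];
  } finally {
    await browser.close();
  }
}

scrapCopasaFunction()
  .then(levels => {
    console.log(levels);
    console.log('Done!');
  })
  .catch(err => console.error(err));

With the rejection handled in the final .catch, a failed navigation is logged instead of producing the UnhandledPromiseRejectionWarning shown above.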