I'm using React, and I'm attempting to scrape a site from my frontend using a Firebase cloud function and Puppeteer. Oddly enough, the code works on my local server, but when I deploy the same code to my live server via Firebase, I get the CORS error below.
Below is the code on my frontend, where I make the request:
import React, {useState} from "react";
import {Typography, Container, TextField, Button} from "@material-ui/core";
import axios from 'axios'
export const Scraper = ({ formData, setForm, navigation }) => {
const { firstName, lastName, displayName, services} = formData;
const [url, setUrl] = useState('')
const [html, setHtml] = useState('')
const handleScrape = async() => {
try {
const response = await axios.get('https://us-central1-cuti-app-7c963.cloudfunctions.net/app/scrape', {
params: {url: url}
})
if(response){
console.log(response.data)
}else{
console.log("Failure Link retrieval")
}
} catch (error) {
console.log("This is the Error:", error)
}
return (
<Container maxWidth="xs" style={{marginTop:'20px', textAlign: "center"}}>
<Typography variant='h4'>Prospect URLL</Typography>
<TextField
label="Booksy"
name="url"
value={url}
onChange={(e)=> {setUrl(e.target.value)}}
margin="normal"
variant="outlined"
autoComplete="off"
required
fullWidth
/>
<Button
variant="contained"
fullWidth
color="primary"
style={{ backgroundColor: '#cf559f',
backgroundSize: "400px",
backgroundPosition: "center",
borderRadius: '0',
color: 'white',
fontWeight: 'bold',
border: '3px #cf559f solid',
letterSpacing: '2px',
marginTop: "1rem" }}
onClick={handleScrape}
>
Next
</Button>
</Container>
);
};
And here is the code snippet from the index.js file for my functions:
const express = require("express");
const app = express();
const cors = require("cors");

// BUG FIX: the option name is `extended`, not `extends` —
// `extends` was silently ignored by body-parser.
app.use(express.urlencoded({ extended: true }));
app.use(express.json());

// Allow any origin for GET/POST with the headers the frontend sends.
const corsOpts = {
  origin: '*',
  methods: [
    'GET',
    'POST',
  ],
  allowedHeaders: [
    'Content-Type', 'Authorization', 'Accept'
  ],
};
app.use(cors(corsOpts));

const functions = require("firebase-functions");
const admin = require('firebase-admin');
admin.initializeApp(functions.config().firebase);
const puppeteer = require('puppeteer');

// GET /scrape?url=... — loads the page headlessly and extracts pro info
// plus the list of services from Booksy's obfuscated class names.
// NOTE: to raise the function's memory limit (the "memory limit exceeded"
// error), export the function with runtime options, e.g.:
//   exports.app = functions.runWith({ memory: '1GB', timeoutSeconds: 120 })
//                          .https.onRequest(app);
app.get('/scrape', cors(), async (req, res) => {
  const { url } = req.query;
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    // BUG FIX: setUserAgent returns a promise — await it so navigation
    // doesn't race the UA change.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36');
    await page.goto(url);

    // Single-value fields scraped from the profile header.
    const proInfo = await page.evaluate(() => {
      const displayName = document.querySelector('h1').innerText;
      const phone = document.querySelector('div[class="purify_dFcdTMoibUeU0IQhEe9mHA=="]').innerText;
      const imgUrl = document.querySelector('div[class="purify_W9xnvEHvIASJ3h0FC-rz7Q=="]').children[0].currentSrc;
      const address = document.querySelector('div[class="purify_prm7MfDXczhTZvcY5KwOuA== purify_Sardy6hfiet162IZ2pYFPA== purify_m9mNOPjpHD0tNTW6GC+hEw=="]').innerText;
      return { img: imgUrl, displayName: displayName, phone: phone, address: address };
    });

    // One entry per service card; the deep children[] chains mirror the
    // site's nested markup — brittle, will break if the DOM changes.
    const services = await page.$$eval('div[class="purify_TJBmvp84N9Sj6dyMFksHKg=="]', divs => {
      return divs.map(x => {
        return {
          name: x.children[0].children[0].innerText,
          details: x.children[0].children[1].innerText,
          price: x.children[1].children[0].children[0].children[0].children[0].children[0].innerText,
          duration: x.children[1].children[0].children[0].children[0].children[0].children[1].innerText
        };
      });
    });

    res.json({ proInfo: proInfo, services: services });
  } catch (e) {
    console.log("ERROR =>", e);
    // BUG FIX: previously no response was sent on error, leaving the
    // request hanging until the platform killed it with a 500.
    res.status(500).json({ error: String(e) });
  } finally {
    // BUG FIX: the browser was never closed, so every request leaked a
    // Chromium instance — the direct cause of the GCP
    // "memory limit exceeded" error in the logs.
    if (browser) {
      await browser.close();
    }
  }
});
Any and all assistance would be GREATLY appreciated!!!
UPDATE #1: I made a couple of edits to the CORS options, but I'm still getting the same error with a 500 status code.
The img below is what I see on the Network tab
UPDATE #2: Looking deeper into the err, I noticed the logs on GCP and it states "Function invocation was interrupted. Error: memory limit exceeded."
See img below.
The NEW question is: Can we increase the memory limit? If so, how?
2 Answers
Seems like you need to pass a couple of more options to cors.
Try this in your `index.js` file.

Hope this is helpful — I had the same problem as you.
To work around it, I used an NPM library which did the job for me:
https://www.npmjs.com/package/scrape-html-web