I have to scrape all the products from this website's shop (https://bewellstore.ro/shop/), but my code stops at the 12th product. I have made a version for websites with multiple shop pages, where I go through all of them in a for loop, but since this shop has only one page I thought that wasn't necessary.
Any idea why my code stops at the 12th product?
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
folder = 'beWell_images'
os.makedirs(folder, exist_ok=True)
root_folder = os.getcwd()
baseurl = 'https://bewellstore.ro/shop/'
# an array for all the product links
product_links = []
# the shop shows only one page, so a single request here
url = 'https://bewellstore.ro/shop/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
product_list = soup.find_all('div', class_= 'loop-product-inner')
print (product_list)
# taking all the links to each product page
for item in product_list:
    for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
        # appending each product link to the list
        product_links.append(link['href'])

print(product_links)
product_items_list = []
i = 0
d = {}  # maps image URL -> saved filename, used to avoid downloading duplicates
os.chdir(folder)
for link_test in product_links:
    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')

    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()

    images = soup.select('.wp-post-image')

    # --- before `for`-loop ---
    downloaded = []

    # --- `for`-loop ---
    for image in images:
        link = image['src']
        if link in d:
            name = d[link]
            downloaded.append(name)
        else:
            i += 1
            name = str(i) + 'img.jpg'
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # downloading the image and saving it under the generated .jpg name
            with open(name, 'wb') as f:
                im = requests.get(link)
                #print("URMEAZA DEBUG: {}".format(im))
                f.write(im.content)
            downloaded.append(name)

    # --- after `for`-loop ---
    # storing all the info about this product
    img_str = ''
    if len(downloaded) > 1:
        for index, img in enumerate(downloaded):
            if index == len(downloaded) - 1:
                img_str = img_str + img
            else:
                img_str = img_str + img + '/'
    else:
        img_str = downloaded[0]

    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5]
    }
    product_items_list.append(product)
os.chdir(root_folder)
# os.chdir('output')
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)
2 Answers
That’s because this webpage uses pagination (12 products per page) and each further page is only loaded when you scroll, so a single request only ever sees the first 12 products. You would have to use selenium to scroll the page.
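A minimal sketch of that approach, assuming Chrome and a matching chromedriver are installed: keep scrolling to the bottom until the page height stops growing, then hand the fully rendered HTML to BeautifulSoup.

import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://bewellstore.ro/shop/')

last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)  # give the lazy-loaded products time to render
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:  # height stopped growing -> everything is loaded
        break
    last_height = new_height

soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
product_list = soup.find_all('div', class_='loop-product-inner')
print(len(product_list))  # should now be more than 12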
But if you only want to use beautifulsoup, there is a workaround: the URL of each page of the shop follows a predictable pattern, so you can request every page directly and scrape it with beautifulsoup, without any scrolling. You can try this for all the pages:
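A sketch of that loop, assuming the standard WooCommerce pagination pattern https://bewellstore.ro/shop/page/<n>/ (check the exact URL in your browser) and treating the first page with no products as the signal to stop:

import requests
from bs4 import BeautifulSoup

product_links = []
page = 1
while True:
    # assumed pagination pattern; page 1 is the same as /shop/
    url = f'https://bewellstore.ro/shop/page/{page}/'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    product_list = soup.find_all('div', class_='loop-product-inner')
    if not product_list:  # an empty page means we are past the last page
        break
    for item in product_list:
        for link in item.find_all('a', href=True,
                                  class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
            product_links.append(link['href'])
    page += 1

print(len(product_links))

The rest of your script (the per-product loop, image downloads, and the CSV export) can stay as it is, since it only consumes product_links.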