skip to Main Content

So I have to scrape all the products from this website’s shop ( https://bewellstore.ro/shop/), but my code stops at the 12th photo. I have made a version for websites with multiple shop pages where I take them all in a for loop, but since here it’s only one page I thought that isn’t necessary.
Any idea why my code stops at the 12th product?



import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

folder = 'beWell_images'
os.makedirs(folder, exist_ok=True)

root_folder = os.getcwd()

baseurl = 'https://bewellstore.ro/shop/'

# an array for all the product links
product_links = [] 

# going through all the pages of the shop
url = 'https://bewellstore.ro/shop/'

r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

product_list = soup.find_all('div', class_= 'loop-product-inner')
print (product_list)
    # taking all the links to each product page
for item in product_list:
    for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
        product_links.append(link['href'])
            # appending the links previously taken to the array
print(product_links)
product_items_list = []

i = 0
d = {}  # use as set() 

os.chdir(folder)

for link_test in product_links:

    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')
    
    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_ = 'price').text.strip()
    header = soup.find('div', class_ = 'woocommerce-product-details__short-description').text.strip()
    sku  = soup.find('span', class_ = 'sku').text.strip()
    categories = soup.find('div' , class_ = 'posted_in').text.strip()
    description = soup.find('div', class_ = 'cell large-6').text.strip()
    brand = soup.find('div', class_ = 'tabs-panel').text.strip()

    images = soup.select('.wp-post-image')
    
    # --- before `for`-loop ---
    
    downloaded = []
    
    # --- `for`-loop ---
    for image in images:
        link = image['src']
        if link in d:
            name = d[link]
            downloaded.append(name)
        else:
            i += 1
            name = str(i) +'img.jpg'
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # here i am adding the .jpg and saving the images
            with open(name, 'wb') as f:
                im = requests.get(link)
                #print("URMEAZA DEBUG: {}".format(im))
                f.write(im.content)
            downloaded.append(name)
            
    # --- after `for`-loop ---
    
    # storing all the infos about this product 

    img_str = ''
    if len(downloaded) > 1:
        for index, img in enumerate(downloaded):
            if index == len(downloaded)-1:
                img_str = img_str + img
            else:
                img_str = img_str + img + '/'
    else: 
       img_str = downloaded[0] 


    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ','').replace(', ', '/'),
        'name' : title,
        'description': description,
        'short_description': header,
        'price' : price[0:5]
    }
    product_items_list.append(product)
    
os.chdir(root_folder)
# os.chdir('output')
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)



2

Answers


  1. That’s because this webpage uses pagination (with 12 products per page) and each page gets loaded only when you scroll. You will have to use selenium to scroll the page.

    But if you only want to use beautifulsoup then there is a work around.

    • The URL for each page looks like this

      https://bewellstore.ro/shop/page/<page_no>/
      
    • Example:

      1st page: https://bewellstore.ro/shop/page/1/
      2nd page: https://bewellstore.ro/shop/page/2/
      
    You could make a request to each of the above URLs and scrape your data using beautifulsoup.
    Login or Signup to reply.
  2. You can try this for the all pages

    import requests
    from bs4 import BeautifulSoup
    import csv 
    from datetime import datetime
    
    results = []
    
    page_number = 1
    import requests
    product_links = [] 
    headers = {
        'authority': 'bewellstore.ro',
        'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
        'sec-ch-ua-platform': '"Linux"',
        'accept': '*/*',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://bewellstore.ro/shop/',
        'accept-language': 'en-US,en;q=0.9',
        'cookie': 'fp_session=new; mc_landing_site=https://bewellstore.ro/shop/; _omappvp=i5rIyW2xsMFKIu3uhQtmFj1TN9jw7aKjO8dgy3SVvWMhAj30NvKFrBXfJLe3dQK6ZdbB4FezbrwFWPGLdKrsj1A1vqN2PRLI; _omappvs=1634795539874; _clck=1f7zptk|1|evr|0; _ga=GA1.2.2117949575.1634795541; _gid=GA1.2.1155690725.1634795541; _fbp=fb.1.1634795541140.1266696245; PHPSESSID=94b6b1996b0b5e831d898c192b4bca06; _clsk=2489zg|1634795542054|1|1|e.clarity.ms/collect; yith_wcwl_session_d235bd7d63b3a120c05ba3c90256789a=%7B%22session_id%22%3A%222e40c31b1503902767c5327edd3cf926%22%2C%22session_expiration%22%3A1637387542%2C%22session_expiring%22%3A1637383942%2C%22cookie_hash%22%3A%2249a81940bd8d39b2f894021c16333e6f%22%7D; omSeen-dwf9rgtvzzrhqylccaag=1634795583943; om-dwf9rgtvzzrhqylccaag=1634795585931; _omra={"dwf9rgtvzzrhqylccaag":"click"}; cookie_notice_accepted=true; ls_smartpush=fdfbe0ffe7800007',
    }
    
    while True:
        response = requests.get(f'https://bewellstore.ro/shop/page/{page_number}/', headers=headers)
        print(response.status_code)
        print(response.url)
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.content, 'html.parser')
        product_list = soup.find_all('div', class_= 'loop-product-inner')
        # print (product_list)  
    
        for item in product_list:
            for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
                product_links.append(link['href'])
                print('Addedn link in product_links list :', link['href'])
    
        product_items_list = []
        i = 0
        d = {}
    
        for link_test in product_links:
    
            r = requests.get(link_test)
            soup = BeautifulSoup(r.content, 'lxml')
    
            title = soup.find('h1', class_='product_title').text.strip()
            price = soup.find('p', class_ = 'price').text.strip()
            header = soup.find('div', class_ = 'woocommerce-product-details__short-description').text.strip()
            sku  = soup.find('span', class_ = 'sku').text.strip()
            categories = soup.find('div' , class_ = 'posted_in').text.strip()
            description = soup.find('div', class_ = 'cell large-6').text.strip()
            brand = soup.find('div', class_ = 'tabs-panel').text.strip()
    
            images = soup.select('.wp-post-image')
    
            downloaded = []
    
            for image in images:
                link = image['src']
                if link in d:
                    name = d[link]
                    downloaded.append(name)
                else:
                    i += 1
                    name = str(i) +'img.jpg'
                    d[link] = name
                    print('link:', link)
                    print('name:', name)
                    print('---')
                # here i am adding the .jpg and saving the images
                with open(name, 'wb') as f:
                    im = requests.get(link)
                    #print("URMEAZA DEBUG: {}".format(im))
                    f.write(im.content)
                    downloaded.append(name)
    
    
            img_str = ''
            if len(downloaded) > 1:
                for index, img in enumerate(downloaded):
                    if index == len(downloaded)-1:
                        img_str = img_str + img
                    else:
                        img_str = img_str + img + '/'
            else: 
                img_str = downloaded[0] 
    
    
            product = {
            'sku': sku,
            'base_image': img_str,
            'small_image': img_str,
            'thumbnail_image': img_str,
            'additional_images': img_str,
            'product_type': 'simple',
            'attribute_set_code': 'Default',
            'categories': categories.replace('Categorii: ','').replace(', ', '/'),
            'name' : title,
            'description': description,
            'short_description': header,
            'price' : price[0:5]
            }
            product_items_list.append(product)
                        
    
        page_number += 1
    
    
    df = pd.DataFrame(product_items_list)
    print(df)
    df.to_csv('beWell.csv', index=False)
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search