Html - Web-scraping using beautiful soup

Steve
February 8, 2024
201 views
1 vote
2 Answers

I have this completed for one website, then the html code on the second website is much different.

I have tried to find the information online in how to write the code differently but not luck.

I am wanting to pull the top 3 results from each URL, append them to .csv, then save. The website code is in the image, my code is below. All help is appreciated

from requests_html import HTMLSession
import pandas as pd

urls = [ 'https://www.businesswire.com/portal/site/home/search/?searchType=all&searchTerm=delix&searchPage=1',
    
]

for url in urls:
    r = s.get(url)
    content = r.html.find('div.bw-news-list li')

    for item in content:
        try:
            title = item.find('h3 a', first=True).text
        except Exception as e:
            print(f"Error getting title: {e}")
            title = ''

        try:
            date = item.find('span.bw-date', first=True).text
        except Exception as e:
            print(f"Error getting date: {e}")
            date = ''

        try:
            summary = item.find('span.bw-news-item-summary', first=True).text
        except Exception as e:
            print(f"Error getting summary: {e}")
            summary = ''

        try:
            url = item.find('h3 a', first=True).absolute_links.pop()
        except Exception as e:
            print(f"Error getting URL: {e}")
            url = ''

        entry_dict = {
            'Title': title,
            'Date': date,
            'Summary': summary,
            'URL': url
        }

        data.append(entry_dict)


df = pd.DataFrame(data)
df.to_csv('BusinessWire_Information.csv', index=False)

print('Finished')

This is what the code looks like on the website:

Results in a blank csv

Answers

You don’t see anything because the data is loaded from different URL via JavaScript. Here is an example how you can load the titles:

import requests
from bs4 import BeautifulSoup

url = "https://www.businesswire.com/portal/site/home/template.BINARYPORTLET/search/resource.process/"

params = {
    "javax.portlet.tpst": "92055fbcbec7e639f1f554100d908a0c",
    "javax.portlet.rst_92055fbcbec7e639f1f554100d908a0c_searchTerm": "delix",  # <-- your search term
    "javax.portlet.rst_92055fbcbec7e639f1f554100d908a0c_resultsPage": "1",
    "javax.portlet.rst_92055fbcbec7e639f1f554100d908a0c_searchType": "all",
    "javax.portlet.rid_92055fbcbec7e639f1f554100d908a0c": "searchPaging",
    "javax.portlet.rcl_92055fbcbec7e639f1f554100d908a0c": "cacheLevelPage",
    "javax.portlet.begCacheTok": "com.vignette.cachetoken",
    "javax.portlet.endCacheTok": "com.vignette.cachetoken",
}

soup = BeautifulSoup(requests.get(url, params=params).content, "html.parser")

for h3 in soup.select(".bw-news-list h3")[:3]:
    print(h3.text)
    print(h3.a["href"])
    print("-" * 80)

Prints:

Delix Therapeutics Awarded National Institutes of Health Grant to Advance Vital Research of Novel Neuroplastogen for Substance Use Disorders
http://www.businesswire.com/news/home/20231213417680/en/Delix-Therapeutics-Awarded-National-Institutes-of-Health-Grant-to-Advance-Vital-Research-of-Novel-Neuroplastogen-for-Substance-Use-Disorders
--------------------------------------------------------------------------------
Delix Presents Interim Data From Phase I Trial of Novel Neuroplastogen at ACNP Annual Meeting
http://www.businesswire.com/news/home/20231206561331/en/Delix-Presents-Interim-Data-From-Phase-I-Trial-of-Novel-Neuroplastogen-at-ACNP-Annual-Meeting
--------------------------------------------------------------------------------
Freedom Biosciences Emerges from Stealth with $10.5 Million Seed Financing to Develop Next-Generation Ketamine and Psychedelic-based Mental Health Treatments
http://www.businesswire.com/news/home/20220823005203/en/Freedom-Biosciences-Emerges-from-Stealth-with-10.5-Million-Seed-Financing-to-Develop-Next-Generation-Ketamine-and-Psychedelic-based-Mental-Health-Treatments
--------------------------------------------------------------------------------

I think for this kind of stuff is better to also use selenium to load dynamic contents before scraping.

Here is an example code I wrote for amazon scraping:

import os
import json
import time
import urllib3
import xmltodict

from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager

# disable SSL certificate verification
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

'''
@version amazon
@author Marco Riccetti
'''
class WebScrapy:

    '''
    '''
    class SetEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, set):
                return list(obj)
            return json.JSONEncoder.default(self, obj)

    '''
    '''
    def __init__(self, log_enabled, max_timeout = 1, firefox_options = Options()) -> None:
        # init params
        self._log_enabled = log_enabled
        self._max_timeout = max_timeout
        self._firefox_options = firefox_options
        # set headless
        self._firefox_options.add_argument('--headless')

    '''
    '''
    def _extract_data(self, text):
        # create data array
        data = []
        # scan tags
        for element in BeautifulSoup(text, 'html.parser').select('[class^="DealGridItem-module"]'):
            data.append(xmltodict.parse(str(element)))
        # return the data
        return data

    '''
    '''
    def _get_pages(self, driver, domain, max_page_next):
        # declare page data list
        page_data = []
        # try
        try:
            # send a GET request to the URL with the custom headers
            driver.get(f'https://{domain}/deals?ref_=nav_cs_gb')
            # wait for rhf-frame (last loaded)
            wait = WebDriverWait(driver, 10)
            # wait until page load complete rhf-frame loaded (last one to be loaded)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.rhf-frame')))
            # scan until max_page_next
            for i in range(1, max_page_next + 1):
                # check for log enabled
                if self._log_enabled:
                    print(f'Current Page {i} / {max_page_next}')
                # wait 5s to be sure
                time.sleep(5)
                # get current page data
                page_data.append(self._extract_data(driver.page_source))
                # get current page index
                page_index = driver.find_element(By.CSS_SELECTOR, '.a-pagination .a-selected a').text
                # go to next page
                driver.execute_script("document.querySelector('.a-last a').click()")
                # wait until page index changes
                wait.until(lambda driver: driver.find_element(By.CSS_SELECTOR, '.a-pagination .a-selected a').text != page_index)
            # return page data list
            return page_data
        except Exception as ex:
            # check for log enabled
            if self._log_enabled:
                print(f'Exception: {ex}')
            # return page data list
            return page_data

    '''
    '''
    def execute(self, seed_domains, max_page_next, output_path):
        # create a new Firefox instance
        driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=self._firefox_options)
        # set timeouts
        driver.implicitly_wait(30)
        driver.set_script_timeout(30)
        driver.set_page_load_timeout(60)
        # try
        try:
            # get current date time
            now = datetime.now()
            # create output dir if not exits
            os.makedirs(output_path, exist_ok=True)
            # scan seed_domains
            for seed_domain in seed_domains:
                # check for log enabled
                if self._log_enabled:
                    print(f'Scraping seed domain [{seed_domain}] ...')
                # get pages
                page_data = self._get_pages(driver, seed_domain, max_page_next)
                # open the file in write mode
                with open(f'{output_path}/{seed_domain}_{now.strftime("%Y-%m-%d %H:%M:%S")}.json', 'w') as file:
                    # Write the JSON data to the file
                    json.dump(page_data, file, indent=4, cls=WebScrapy.SetEncoder)
        finally:
            # close the driver
            driver.quit()


'''
---------- Main ----------
'''
if __name__ == '__main__':
    #
    ws = WebScrapy(log_enabled = True)
    #
    ws.execute(
        seed_domains = [
            "amazon.ca", # canada
            "amazon.com", # america
            "amazon.com.au", # australia
            "amazon.com.mx", # messico
            "amazon.co.uk",  # inghilterra
            "amazon.de", # germania
            "amazon.es", # spagna
            "amazon.fr", # francia
            "amazon.in", # india
            "amazon.it", # italia
            "amazon.nl", # paesi bassi
        ],
        max_page_next = 10,
        output_path = './outputs'
    )

Please signup or login to give your own answer.

Click here to cancel reply.

Html – Web-scraping using beautiful soup

Answers