
I have this working for one website, but the HTML on the second website is quite different.

I have tried to find information online about how to write the code differently, but no luck.

I want to pull the top 3 results from each URL, append them to a .csv, then save. The website's HTML is shown in the image, and my code is below. All help is appreciated.

from requests_html import HTMLSession
import pandas as pd

urls = [
    'https://www.businesswire.com/portal/site/home/search/?searchType=all&searchTerm=delix&searchPage=1',
]

s = HTMLSession()  # the session object was missing from the original snippet
data = []          # collects one dict per news item

for url in urls:
    r = s.get(url)
    content = r.html.find('div.bw-news-list li')

    for item in content[:3]:  # only the top 3 results per URL
        try:
            title = item.find('h3 a', first=True).text
        except Exception as e:
            print(f"Error getting title: {e}")
            title = ''

        try:
            date = item.find('span.bw-date', first=True).text
        except Exception as e:
            print(f"Error getting date: {e}")
            date = ''

        try:
            summary = item.find('span.bw-news-item-summary', first=True).text
        except Exception as e:
            print(f"Error getting summary: {e}")
            summary = ''

        try:
            link = item.find('h3 a', first=True).absolute_links.pop()
        except Exception as e:
            print(f"Error getting URL: {e}")
            link = ''

        entry_dict = {
            'Title': title,
            'Date': date,
            'Summary': summary,
            'URL': link
        }

        data.append(entry_dict)


df = pd.DataFrame(data)
df.to_csv('BusinessWire_Information.csv', index=False)

print('Finished')

This is what the code looks like on the website:

[screenshot of the site's HTML]

The result is a blank CSV.

2 Answers


  1. You don’t see anything because the data is loaded from a different URL via JavaScript. Here is an example of how you can load the titles:

    import requests
    from bs4 import BeautifulSoup
    
    url = "https://www.businesswire.com/portal/site/home/template.BINARYPORTLET/search/resource.process/"
    
    params = {
        "javax.portlet.tpst": "92055fbcbec7e639f1f554100d908a0c",
        "javax.portlet.rst_92055fbcbec7e639f1f554100d908a0c_searchTerm": "delix",  # <-- your search term
        "javax.portlet.rst_92055fbcbec7e639f1f554100d908a0c_resultsPage": "1",
        "javax.portlet.rst_92055fbcbec7e639f1f554100d908a0c_searchType": "all",
        "javax.portlet.rid_92055fbcbec7e639f1f554100d908a0c": "searchPaging",
        "javax.portlet.rcl_92055fbcbec7e639f1f554100d908a0c": "cacheLevelPage",
        "javax.portlet.begCacheTok": "com.vignette.cachetoken",
        "javax.portlet.endCacheTok": "com.vignette.cachetoken",
    }
    
    soup = BeautifulSoup(requests.get(url, params=params).content, "html.parser")
    
    for h3 in soup.select(".bw-news-list h3")[:3]:
        print(h3.text)
        print(h3.a["href"])
        print("-" * 80)
    

    Prints:

    Delix Therapeutics Awarded National Institutes of Health Grant to Advance Vital Research of Novel Neuroplastogen for Substance Use Disorders
    http://www.businesswire.com/news/home/20231213417680/en/Delix-Therapeutics-Awarded-National-Institutes-of-Health-Grant-to-Advance-Vital-Research-of-Novel-Neuroplastogen-for-Substance-Use-Disorders
    --------------------------------------------------------------------------------
    Delix Presents Interim Data From Phase I Trial of Novel Neuroplastogen at ACNP Annual Meeting
    http://www.businesswire.com/news/home/20231206561331/en/Delix-Presents-Interim-Data-From-Phase-I-Trial-of-Novel-Neuroplastogen-at-ACNP-Annual-Meeting
    --------------------------------------------------------------------------------
    Freedom Biosciences Emerges from Stealth with $10.5 Million Seed Financing to Develop Next-Generation Ketamine and Psychedelic-based Mental Health Treatments
    http://www.businesswire.com/news/home/20220823005203/en/Freedom-Biosciences-Emerges-from-Stealth-with-10.5-Million-Seed-Financing-to-Develop-Next-Generation-Ketamine-and-Psychedelic-based-Mental-Health-Treatments
    --------------------------------------------------------------------------------
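
    If you also want the date, summary, and link for the top 3 results written to a CSV, as in your original script, here is a minimal sketch building on the snippet above. It reuses the same url and params, and it assumes the result items are li elements inside .bw-news-list that carry the span.bw-date and span.bw-news-item-summary classes from your original selectors:

    import requests
    import pandas as pd
    from bs4 import BeautifulSoup

    # url and params are the same as in the snippet above
    soup = BeautifulSoup(requests.get(url, params=params).content, "html.parser")

    data = []
    # keep only the first three result items
    for item in soup.select(".bw-news-list li")[:3]:
        a = item.select_one("h3 a")
        date = item.select_one("span.bw-date")
        summary = item.select_one("span.bw-news-item-summary")
        data.append({
            "Title": a.get_text(strip=True) if a else "",
            "Date": date.get_text(strip=True) if date else "",
            "Summary": summary.get_text(strip=True) if summary else "",
            "URL": a["href"] if a else "",
        })

    pd.DataFrame(data).to_csv("BusinessWire_Information.csv", index=False)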
    
  2. I think for this kind of task it is better to use Selenium to load the dynamic content before scraping.

    Here is some example code I wrote for scraping Amazon; a sketch adapted to the BusinessWire page follows it:

    import os
    import json
    import time
    import urllib3
    import xmltodict
    
    from datetime import datetime
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from webdriver_manager.firefox import GeckoDriverManager
    
    # disable SSL certificate verification
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    
    '''
    @version amazon
    @author Marco Riccetti
    '''
    class WebScrapy:
    
        '''
        '''
        class SetEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, set):
                    return list(obj)
                return json.JSONEncoder.default(self, obj)
    
        '''
        '''
        def __init__(self, log_enabled, max_timeout = 1, firefox_options = Options()) -> None:
            # init params
            self._log_enabled = log_enabled
            self._max_timeout = max_timeout
            self._firefox_options = firefox_options
            # set headless
            self._firefox_options.add_argument('--headless')
    
        '''
        '''
        def _extract_data(self, text):
            # create data array
            data = []
            # scan tags
            for element in BeautifulSoup(text, 'html.parser').select('[class^="DealGridItem-module"]'):
                data.append(xmltodict.parse(str(element)))
            # return the data
            return data
    
        '''
        '''
        def _get_pages(self, driver, domain, max_page_next):
            # declare page data list
            page_data = []
            # try
            try:
                # send a GET request to the URL with the custom headers
                driver.get(f'https://{domain}/deals?ref_=nav_cs_gb')
                # wait for rhf-frame (last loaded)
                wait = WebDriverWait(driver, 10)
                # wait until page load complete rhf-frame loaded (last one to be loaded)
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.rhf-frame')))
                # scan until max_page_next
                for i in range(1, max_page_next + 1):
                    # check for log enabled
                    if self._log_enabled:
                        print(f'Current Page {i} / {max_page_next}')
                    # wait 5s to be sure
                    time.sleep(5)
                    # get current page data
                    page_data.append(self._extract_data(driver.page_source))
                    # get current page index
                    page_index = driver.find_element(By.CSS_SELECTOR, '.a-pagination .a-selected a').text
                    # go to next page
                    driver.execute_script("document.querySelector('.a-last a').click()")
                    # wait until page index changes
                    wait.until(lambda driver: driver.find_element(By.CSS_SELECTOR, '.a-pagination .a-selected a').text != page_index)
                # return page data list
                return page_data
            except Exception as ex:
                # check for log enabled
                if self._log_enabled:
                    print(f'Exception: {ex}')
                # return page data list
                return page_data
    
        '''
        '''
        def execute(self, seed_domains, max_page_next, output_path):
            # create a new Firefox instance
            driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=self._firefox_options)
            # set timeouts
            driver.implicitly_wait(30)
            driver.set_script_timeout(30)
            driver.set_page_load_timeout(60)
            # try
            try:
                # get current date time
                now = datetime.now()
                # create output dir if not exits
                os.makedirs(output_path, exist_ok=True)
                # scan seed_domains
                for seed_domain in seed_domains:
                    # check for log enabled
                    if self._log_enabled:
                        print(f'Scraping seed domain [{seed_domain}] ...')
                    # get pages
                    page_data = self._get_pages(driver, seed_domain, max_page_next)
                    # open the file in write mode
                    with open(f'{output_path}/{seed_domain}_{now.strftime("%Y-%m-%d %H:%M:%S")}.json', 'w') as file:
                        # Write the JSON data to the file
                        json.dump(page_data, file, indent=4, cls=WebScrapy.SetEncoder)
            finally:
                # close the driver
                driver.quit()
    
    
    '''
    ---------- Main ----------
    '''
    if __name__ == '__main__':
        #
        ws = WebScrapy(log_enabled = True)
        #
        ws.execute(
            seed_domains = [
                "amazon.ca", # canada
                "amazon.com", # america
                "amazon.com.au", # australia
                "amazon.com.mx", # messico
                "amazon.co.uk",  # inghilterra
                "amazon.de", # germania
                "amazon.es", # spagna
                "amazon.fr", # francia
                "amazon.in", # india
                "amazon.it", # italia
                "amazon.nl", # paesi bassi
            ],
            max_page_next = 10,
            output_path = './outputs'
        )
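
    Applied to the BusinessWire search page from the question, a minimal Selenium sketch might look like the one below. It assumes the div.bw-news-list li markup with the bw-date and bw-news-item-summary spans from your original selectors is present once the JavaScript has run, and it uses headless Firefox with an arbitrary 15-second wait:

    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)

    try:
        driver.get('https://www.businesswire.com/portal/site/home/search/?searchType=all&searchTerm=delix&searchPage=1')
        # wait until the JavaScript-rendered result list is present
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.bw-news-list li')))

        data = []
        for item in driver.find_elements(By.CSS_SELECTOR, 'div.bw-news-list li')[:3]:
            link = item.find_element(By.CSS_SELECTOR, 'h3 a')
            data.append({
                'Title': link.text,
                'Date': item.find_element(By.CSS_SELECTOR, 'span.bw-date').text,
                'Summary': item.find_element(By.CSS_SELECTOR, 'span.bw-news-item-summary').text,
                'URL': link.get_attribute('href'),
            })

        pd.DataFrame(data).to_csv('BusinessWire_Information.csv', index=False)
    finally:
        driver.quit()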
    