skip to Main Content

I want to create a catalog of courses by scraping data from a website. I want to go to each item in the url https://www.coursicle.com/harvard/courses/
and pull all the course names within each item. I am using the below code

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def scrape_harvard_course_names():
    """Scrape course/subject names from the Coursicle Harvard catalog.

    Fetches the catalog index page, collects every link inside the
    "tileContainer" element, then follows each link and also records the
    heading found on the detail page.

    Returns:
        list[str]: link texts and detail-page course names, interleaved
        in page order.

    Raises:
        requests.HTTPError: if any page request returns an error status.
        AttributeError: if the expected page structure is missing.
    """
    url = "https://www.coursicle.com/harvard/courses/"
    # Coursicle serves a stripped-down page to clients without a
    # browser-like User-Agent — that is why soup.find("div",
    # id="tileContainer") was returning None.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; rv:91.0) "
            "Gecko/20100101 Firefox/91.0"
        )
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    tile_container = soup.find("div", id="tileContainer")
    links = tile_container.find_all("a")

    course_names = []
    for link in links:
        course_names.append(link.text)

        # The hrefs on the index page are relative; resolve them against
        # the catalog URL before requesting the detail page.
        detail_url = urljoin(url, link.get("href"))
        response = requests.get(detail_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        course_names.append(soup.find("h1", class_="course-name").text)

    return course_names


if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    course_names = scrape_harvard_course_names()
    for course_name in course_names:
        print(course_name)

in this code soup.find("div", id="tileContainer") doesn’t return anything. Hence this code doesn’t work. Is there a way to scrape this data?

2

Answers


  1. You need to add a header to the request

        # A browser-like User-Agent is required; without it the site
        # serves a page that does not contain the "tileContainer" div.
        url = "https://www.coursicle.com/harvard/courses/"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
        }
        response = requests.get(url, headers=headers)
    

    Your code has another problem: the href paths are relative, so you need to build the absolute URL before visiting them (import urljoin first: from urllib.parse import urljoin)

            # The href is relative; resolve it against the page URL
            # before fetching ("clicking") the course detail page.
            link.attrs['href'] = urljoin(url, link.get('href'))
            response = requests.get(link["href"])
    
    Login or Signup to reply.
  2. You can use this:

    import requests
    from bs4 import BeautifulSoup


    # Browser-like request headers so Coursicle serves the full page
    # instead of blocking the scraper.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9,ar;q=0.8,fr;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Cookie': '_ga=GA1.1.741382807.1690177394; _ga_SRFRW1PCBK=GS1.1.1690177394.1.1.1690177622.0.0.0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.86',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Microsoft Edge";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    response = requests.get('https://www.coursicle.com/harvard/courses/', headers=headers)

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all 'a' elements with class 'tileElement'
    a_elements = soup.find_all('a', class_='tileElement')

    # Extract and print the subject name from each tile.
    # (The assignment and the `if` must be aligned inside the loop body;
    # the original snippet's mismatched indentation raised an
    # IndentationError.)
    for a_element in a_elements:
        subject_name = a_element.find('span', class_='tileElementText').text.strip()
        if subject_name:
            print("Subject Name:", subject_name)
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search