How do I extract html tag elements with Python?

abcfhy
May 19, 2023
268 views
0 votes
2 Answers

I am trying to scrape some HTML elements for later use, but the way the page is structured seems to prevent the usual code examples I can find from locating them.

The element is rather long (there are 70+ options in the list), so I am showing only the first few items here:

<select title="Please select Automatic Weather Observations" id="aws" onchange="changeYearDropDown2()">
    <option value="">===Manned Weather Station===</option>
    <option value="HKO">Hong Kong Observatory</option>
    <option value="HKA">Hong Kong International Airport</option>
    <option value=""></option>
    <option value="">===Automatic Weather Station===</option>
    <option value="BR1">Beas River</option>
    <option value="BHD">Bluff Head</option>
    <option value="CP1">Central Pier</option>
    <option value="CCH">Cheung Chau</option>
    <option value="CPH">Ching Pak House(Tsing Yi)</option>

What I actually want are the `value` attributes of the option tags AND the full station names, as two separate lists. Since there are also some ‘blank’ option tags in the HTML (with an empty value or no text), I don’t know how to get rid of them and extract only the ones that have both a value and a name…

Any help is appreciated! Thanks a lot in advance!!

Here are some methods I tried, and I don’t know why the find_all or find_elements fail:

Using BeautifulSoup:

import requests
from bs4 import BeautifulSoup as bs
  
# Page whose <select id="aws"> station list is being scraped.
url = 'https://www.hko.gov.hk/en/cis/climat.htm'
  
# Download the page once over HTTP and parse the markup that came back;
# this attempt works purely on the text of that single response.
req = requests.get(url)
soup = bs(req.text, 'html.parser')

#method 1
# NOTE(review): `soup.select` is referenced here without being called, so this
# prints the attribute itself rather than the result of a CSS query.
print(soup.select)    #it prints the whole document for me
# Attribute-style lookup of an <option> tag on the parsed document.
print(soup.option)    #empty list

#method 2
# Collect every <option> tag found in the downloaded markup.
titles = soup.find_all('option')
print(titles)    #empty list

Using Selenium:

from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By

# Build the driver service from a chromedriver executable path.
service = Service(executable_path="path/chromedriver_win32/chromedriver.exe")
#initialize web driver
# Every attempt below runs inside this `with` block, which owns the browser session.
with webdriver.Chrome(service=service) as driver:
    #navigate to the url
    # NOTE(review): `url` is not assigned in this snippet; presumably it is the
    # value defined in the BeautifulSoup attempt - confirm before running standalone.
    driver.get(url)

#method 3: find element by CSS selector
    # find_element (singular) is used, so a single <option> under a <select> is inspected.
    myDiv = driver.find_element(By.CSS_SELECTOR, "select option")    #with or without '>' returns the same output, and find_elements ALWAYS gets me an error
    print(myDiv.get_attribute("outerHTML"))
    print(myDiv.get_attribute("innerHTML")) #exc <select></select> results

#method 4: find element by tag name
    # Looks up one <select> element by tag name and dumps its full markup.
    myDiv2 = driver.find_element(By.TAG_NAME, 'select')
    print(myDiv2.get_attribute("outerHTML"))    
    #by far the 'best': return a long line of html with all the html tags and name of stations

#method 5: run a loop
    #still in the same with function shown above
    # NOTE(review): this name shadows the built-in `list` for the rest of the script.
    list = []
    # r runs from 1 to 73 and is spliced into the XPath as the option[...] index.
    for r in range(1,74):    #there are 73 list elements
        # Only `.text` is read from each element; the option's `value` attribute
        # is never requested, so the collected items are display names only.
        value = driver.find_element(By.XPATH, "/html/body/div[2]/div[2]/div/div/div[4]/div/div[3]/div[1]/div/div[2]/div[2]/div[1]/div/ul/li[1]/select/option["+str(r)+"]").text
        list.append(value)
    print(list)
    #not bad but I still can't get the tag values
    #using print() just gives me blank list

Answers

You have to wait for the options to load. The options are most probably generated dynamically by JavaScript after the page loads, which is why BeautifulSoup (which only sees the HTML returned by the server) is not finding them.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.by import By

# Explicit-wait helpers, imported here so this snippet stays self-contained.
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Ignore the following settings: (for setup purposes only)
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# chrome_options.add_argument('--headless')

URL = 'https://www.hko.gov.hk/en/cis/climat.htm'
# Every <option> nested under the element whose id is "aws".
OPTIONS_LOCATOR = (By.CSS_SELECTOR, '#aws option')
# Upper bound on how long to poll before giving up with a TimeoutException.
WAIT_SECONDS = 10

# Using the driver as a context manager shuts the browser down even when
# loading the page or waiting for the options raises.
with webdriver.Chrome(options=chrome_options) as driver:
    driver.get(URL)

    # The options appear to be filled in after the initial page load, so poll
    # until at least one exists instead of sleeping for a fixed time: this
    # returns as soon as they are ready and fails loudly if they never show up.
    options = WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_all_elements_located(OPTIONS_LOCATOR)
    )

    for option in options:
        name = option.get_attribute("textContent")
        value = option.get_attribute("value")
        # Skip separator/header entries: keep only options that carry both a
        # non-empty label and a non-empty value (also tolerates None).
        if name and value:
            print(name, value)
            # do something

You may try with selenium this way:

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC

# Own the browser session with a context manager so Chrome is always closed
# when the block exits, including when the wait below times out.
with Chrome() as driver:
    driver.get('https://www.hko.gov.hk/en/cis/climat.htm')

    # Poll for up to 5 seconds until the direct <option> children of
    # <select id="aws"> are reported visible, then take them all at once.
    options = WebDriverWait(driver, 5).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'select#aws > option'))
    )

    # Print each option's value attribute followed by its displayed text.
    for option in options:
        print(option.get_attribute('value'), option.text)

Please signup or login to give your own answer.

Click here to cancel reply.