skip to Main Content

I have a couple of web-scraping scripts using Selenium and Firefox that I’m trying to run inside a Docker container. I’m using Arch Linux, but I’ve made sure to closely follow the install directions on the official Docker website specific to Arch, and the test container ran just fine. This container only tries to run one of these scripts, which works on my system in a venv with the same packages installed as listed in the requirements.txt file. I’ve made sure to download compatible browser and driver versions within the Dockerfile as well, so I’m unsure why I’m getting errors specific to the web driver when running the container.

Errors:

2023-12-06 02:12:20 Traceback (most recent call last):
2023-12-06 02:12:20   File "/home/retupmoc/PycharmProjects/EffortScraper/app/dimeFetch   /AssistFetch.py", line 18, in <module>
2023-12-06 02:12:20     driver = webdriver.Firefox(service=service, options=options)
2023-12-06 02:12:20              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/firefox  /webdriver.py", line 67, in __init__
2023-12-06 02:12:20     super().__init__(command_executor=executor, options=options)
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 208, in __init__
2023-12-06 02:12:20     self.start_session(capabilities)
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 292, in start_session
2023-12-06 02:12:20     response = self.execute(Command.NEW_SESSION, caps)["value"]
2023-12-06 02:12:20                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 347, in execute
2023-12-06 02:12:20     self.error_handler.check_response(response)
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/errorhandler.py", line 229, in check_response
2023-12-06 02:12:20     raise exception_class(message, screen, stacktrace)
2023-12-06 02:12:20 selenium.common.exceptions.WebDriverException: Message: Process unexpectedly closed with status 255        

Script:

 import os
import csv
import time
import numpy as np
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions

# Set up the Firefox options and WebDriver
service = FirefoxService()
options = FirefoxOptions()
options.add_argument('-headless')  # Uncomment if you run in headless mode
options.add_argument("--window-size=1920x,1080")
driver = webdriver.Firefox(service=service, options=options)

# Open the webpage
driver.get('https://www.nba.com/stats/players/passing?LastNGames=1&dir=D&sort=POTENTIAL_AST')

# Wait for the table data to load
print('1')
table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'nba-stats-content-block')))
print('2')

headers = table.find_elements(By.TAG_NAME, 'th')
print('3')

header_titles = [header.text for header in headers]
print("Logski")
print(header_titles)

# Replacing redundant "AST" with actual header name
header_titles[9] = "SecondaryAssists"
header_titles[10] = "PotentialAssists"
header_titles[11] = "AssistPointsCreated"
header_titles[12] = "AdjustedAssists"
header_titles[13] = "AstToPass%"
header_titles[14] = "AdjAstToPass%"

xbutton = WebDriverWait(driver,3).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".onetrust-close-btn-handler.onetrust-close-btn-ui.banner-close-button.ot-close-icon")))
xbutton.click()


# Conditional assignments
if len(header_titles) > 9:
    header_titles[9] = "SecondaryAssists"
if len(header_titles) > 10:
    header_titles[10] = "PotentialAssists"
if len(header_titles) > 11:
    header_titles[11] = "AssistPointsCreated"

# Sort the table by clicking on the table header (sorting by rebounds)
time.sleep(np.random.uniform(3.80,6.0))
header_to_sort = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//th[text()="AST"]')))
time.sleep(np.random.uniform(1.0,1.45))
header_to_sort.click()
time.sleep(np.random.uniform(4.0,5.35))  # Replace with a more reliable wait condition

# Locate the dropdown by the known prefix of the class name
dropdown_prefix = "DropDown_select"  # This is the consistent prefix of the class name
dropdown = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, f"//select[starts-with(@class, '{dropdown_prefix}')]"))
)

# Select the 'All' option by its value
all_option_value = "-1"
all_option = dropdown.find_element(By.XPATH, f"//option[@value='{all_option_value}']")
all_option.click()

# Get the current date and time for the filename
current_time = datetime.now()
timestamp = current_time.strftime("%Y%m%d_%H%M%S")

#save .csv files to archive folder
directory = "AssistsArchive"
if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"Directory '{directory}' created")

# Open a new CSV file with the timestamp in the name to save the data
filename = os.path.join(directory, f'passing_data_{timestamp}.csv')
table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'nba-stats-content-block')))

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header_titles)  # Write the headers to the CSV
    rows = table.find_elements(By.TAG_NAME, 'tr')
    for row in rows:
        # Get all the columns for the row
        cols = row.find_elements(By.TAG_NAME, 'td')
        # Write the columns to the CSV
        writer.writerow([col.text for col in cols])

# Close the driver
driver.quit()



DockerFile:

# Use an official Python runtime as a parent image
FROM python:3.11.6

# Set the working directory to the project root inside the container
WORKDIR /home/retupmoc/PycharmProjects/EffortScraper/app

# Copy the contents of the project directory into the container
COPY . .

# Install Python dependencies.
# Multi-line RUN commands need a trailing backslash on each continued line;
# without it "&& pip install ..." is parsed as a separate, invalid instruction.
RUN pip install --upgrade pip \
    && pip install -r requirements.txt

# geckodriver is only a proxy: Firefox itself must also be present in the
# image, otherwise the driver dies immediately with
# "Process unexpectedly closed with status 255". firefox-esr is the
# Firefox package available in the Debian base of python:3.11.6.
RUN apt-get update \
    && apt-get install -y --no-install-recommends firefox-esr \
    && rm -rf /var/lib/apt/lists/*

# Remove existing geckodriver from PATH if present
RUN rm /usr/local/bin/geckodriver || true

# Install GeckoDriver (note: the download URL must contain no spaces)
RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz \
    && tar -zxvf geckodriver-v0.33.0-linux64.tar.gz \
    && mv geckodriver /usr/local/bin/ \
    && rm geckodriver-v0.33.0-linux64.tar.gz

# Define environment variable (key=value is the current form; the
# space-separated form is legacy)
ENV NAME=comboFetch

# Run dimeFetch script by default when the container launches
CMD ["python", "dimeFetch/AssistFetch.py"]

3

Answers


  1. It looks like the core part of your error is this:

    2023-12-06 02:12:20 selenium.common.exceptions.WebDriverException: Message: Process unexpectedly closed with status 255
    

    And doing a quick search for that error brings me to this other post:

    Selenium py – Process unexpectedly closed with status 255

    So I’d check there for a solution.

    Login or Signup to reply.
  2. A better solution is not to use Selenium at all: the site's backing API returns the data in a clean JSON format.

    import requests
    import pandas as pd
    
    
    url = 'https://stats.nba.com/stats/leaguedashptstats'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'referer': 'https://www.nba.com/'}
    payload = {
        'LastNGames': '1',
        'LeagueID': '00',
        'Location': '',
        'Month': '0',
        'OpponentTeamID': '0',
        'Outcome': '',
        'PORound': '0',
        'PerMode': 'PerGame',
        'PlayerExperience': '',
        'PlayerOrTeam': 'Player',
        'PlayerPosition': '',
        'PtMeasureType': 'Passing',
        'Season': '2023-24',
        'SeasonSegment': '',
        'SeasonType': 'Regular Season',
        'StarterBench': '',
        'TeamID': '0'}
    
    jsonData = requests.get(url, headers=headers, params=payload, timeout=10).json()
    data = jsonData['resultSets'][0]
    df = pd.DataFrame(data['rowSet'], columns=data['headers'])
    

    Output:

    1st 5 rows of 263

    print(df.head().to_string())
       PLAYER_ID    PLAYER_NAME     TEAM_ID TEAM_ABBREVIATION  GP  W  L   MIN  PASSES_MADE  PASSES_RECEIVED  AST  FT_AST  SECONDARY_AST  POTENTIAL_AST  AST_POINTS_CREATED  AST_ADJ  AST_TO_PASS_PCT  AST_TO_PASS_PCT_ADJ
    0    1630639    A.J. Lawson  1610612742               DAL   1  1  0   6.7          4.0              4.0  0.0     0.0            0.0            0.0                 0.0      0.0            0.000                0.000
    1     203932   Aaron Gordon  1610612743               DEN   1  0  1  31.3         28.0             21.0  2.0     0.0            0.0            4.0                 6.0      2.0            0.071                0.071
    2    1628988  Aaron Holiday  1610612745               HOU   1  1  0  26.9         42.0             47.0  4.0     0.0            0.0            6.0                12.0      4.0            0.095                0.095
    3    1630598  Aaron Wiggins  1610612760               OKC   1  0  1  15.5         13.0             13.0  0.0     0.0            1.0            3.0                 0.0      1.0            0.000                0.077
    4     201143     Al Horford  1610612738               BOS   1  0  1  31.4         23.0             17.0  4.0     0.0            0.0            7.0                12.0      4.0            0.174                0.174
    
    Login or Signup to reply.
  3. Try to install chrome in the docker container:

    # Use an official Python runtime as a parent image
    FROM python:3.11.6

    # Set the working directory to /app
    WORKDIR /home/retupmoc/PycharmProjects/EffortScraper/app

    # Add Google's signing key and the Chrome apt repository.
    # NOTE(review): apt-key is deprecated on current Debian; prefer placing
    # the key under /etc/apt/keyrings and using a signed-by clause.
    RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
    RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
    RUN apt-get -y update
    # Install Chrome plus unzip (needed to unpack chromedriver below)
    RUN apt-get install -y google-chrome-stable
    RUN apt-get install -yqq unzip
    # NOTE(review): the LATEST_RELEASE endpoint only serves drivers up to
    # Chrome 114; for Chrome 115+ use the "Chrome for Testing" downloads.
    RUN wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip
    RUN unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
    ENV DISPLAY=:99

    # Copy the contents of the PyCharmProjects directory into the container at /app
    COPY . .

    # Install Python dependencies. Continued RUN lines need a trailing
    # backslash; without it the "&&" line is an invalid instruction.
    RUN pip install --upgrade pip \
        && pip install -r requirements.txt

    # Remove existing geckodriver from PATH if present
    RUN rm /usr/local/bin/geckodriver || true

    # ... rest of your code
    

    And use it like this in the script:

    # Selenium imports
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By

    def set_chrome_options():
        """Return Chrome Options configured for headless use in a container.

        --no-sandbox is required when Chrome runs as root (the default in
        Docker); --disable-dev-shm-usage avoids crashes caused by the tiny
        default /dev/shm inside containers.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # Disable image loading to speed up scraping; use the supported
        # add_experimental_option API rather than mutating the dict directly.
        chrome_prefs = {"profile.default_content_settings": {"images": 2}}
        chrome_options.add_experimental_option("prefs", chrome_prefs)
        return chrome_options

    browser = webdriver.Chrome(options=set_chrome_options())

    browser.get("some-url")

    # Selenium 4 removed the find_elements_by_* helpers; use the
    # find_elements(By.<strategy>, ...) form instead.
    rows = browser.find_elements(By.XPATH, "your-xpath selector")
    
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search