skip to Main Content

I have a couple of web-scraping scripts using Selenium and Firefox that I’m trying to run inside a Docker container. I’m using Arch Linux, but I’ve made sure to closely follow the install directions on the official Docker website specific to Arch, and the test container ran just fine. This container only tries to run one of these scripts, which works on my system in a venv with the same packages installed as listed in the requirements.txt file. I’ve made sure to download compatible browser and driver versions within the Dockerfile as well, so I’m unsure why I’m getting errors specific to the web driver when running the container.

Errors:

2023-12-06 02:12:20 Traceback (most recent call last):
2023-12-06 02:12:20   File "/home/retupmoc/PycharmProjects/EffortScraper/app/dimeFetch   /AssistFetch.py", line 18, in <module>
2023-12-06 02:12:20     driver = webdriver.Firefox(service=service, options=options)
2023-12-06 02:12:20              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/firefox  /webdriver.py", line 67, in __init__
2023-12-06 02:12:20     super().__init__(command_executor=executor, options=options)
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 208, in __init__
2023-12-06 02:12:20     self.start_session(capabilities)
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 292, in start_session
2023-12-06 02:12:20     response = self.execute(Command.NEW_SESSION, caps)["value"]
2023-12-06 02:12:20                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 347, in execute
2023-12-06 02:12:20     self.error_handler.check_response(response)
2023-12-06 02:12:20   File "/usr/local/lib/python3.11/site-packages/selenium/webdriver/remote/errorhandler.py", line 229, in check_response
2023-12-06 02:12:20     raise exception_class(message, screen, stacktrace)
2023-12-06 02:12:20 selenium.common.exceptions.WebDriverException: Message: Process unexpectedly closed with status 255        

Script:

 import os
import csv
import time
import numpy as np
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions

# Set up the Firefox options and WebDriver
service = FirefoxService()
options = FirefoxOptions()
options.add_argument('-headless')  # Uncomment if you run in headless mode
options.add_argument("--window-size=1920x,1080")
driver = webdriver.Firefox(service=service, options=options)

# Open the webpage
driver.get('https://www.nba.com/stats/players/passing?LastNGames=1&dir=D&sort=POTENTIAL_AST')

# Wait for the table data to load
print('1')
table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'nba-stats-content-block')))
print('2')

headers = table.find_elements(By.TAG_NAME, 'th')
print('3')

header_titles = [header.text for header in headers]
print("Logski")
print(header_titles)

# Replacing redundant "AST" with actual header name
header_titles[9] = "SecondaryAssists"
header_titles[10] = "PotentialAssists"
header_titles[11] = "AssistPointsCreated"
header_titles[12] = "AdjustedAssists"
header_titles[13] = "AstToPass%"
header_titles[14] = "AdjAstToPass%"

xbutton = WebDriverWait(driver,3).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".onetrust-close-btn-handler.onetrust-close-btn-ui.banner-close-button.ot-close-icon")))
xbutton.click()


# Conditional assignments
if len(header_titles) > 9:
    header_titles[9] = "SecondaryAssists"
if len(header_titles) > 10:
    header_titles[10] = "PotentialAssists"
if len(header_titles) > 11:
    header_titles[11] = "AssistPointsCreated"

# Sort the table by clicking on the table header (sorting by rebounds)
time.sleep(np.random.uniform(3.80,6.0))
header_to_sort = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//th[text()="AST"]')))
time.sleep(np.random.uniform(1.0,1.45))
header_to_sort.click()
time.sleep(np.random.uniform(4.0,5.35))  # Replace with a more reliable wait condition

# Locate the dropdown by the known prefix of the class name
dropdown_prefix = "DropDown_select"  # This is the consistent prefix of the class name
dropdown = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, f"//select[starts-with(@class, '{dropdown_prefix}')]"))
)

# Select the 'All' option by its value
all_option_value = "-1"
all_option = dropdown.find_element(By.XPATH, f"//option[@value='{all_option_value}']")
all_option.click()

# Get the current date and time for the filename
current_time = datetime.now()
timestamp = current_time.strftime("%Y%m%d_%H%M%S")

#save .csv files to archive folder
directory = "AssistsArchive"
if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"Directory '{directory}' created")

# Open a new CSV file with the timestamp in the name to save the data
filename = os.path.join(directory, f'passing_data_{timestamp}.csv')
table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'nba-stats-content-block')))

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header_titles)  # Write the headers to the CSV
    rows = table.find_elements(By.TAG_NAME, 'tr')
    for row in rows:
        # Get all the columns for the row
        cols = row.find_elements(By.TAG_NAME, 'td')
        # Write the columns to the CSV
        writer.writerow([col.text for col in cols])

# Close the driver
driver.quit()



DockerFile:

# Use an official Python runtime as a parent image
FROM python:3.11.6

# Set the working directory to the project root inside the container
WORKDIR /home/retupmoc/PycharmProjects/EffortScraper/app

# Copy the contents of the project directory into the container
COPY . .

# Install Python dependencies.
# Multi-line RUN commands need a trailing backslash on each continued line;
# without it "&& pip install ..." is parsed as a separate, invalid instruction.
RUN pip install --upgrade pip \
    && pip install -r requirements.txt

# geckodriver is only a proxy: Firefox itself must also be present in the
# image, otherwise the driver dies immediately with
# "Process unexpectedly closed with status 255". firefox-esr is the
# Firefox package available in the Debian base of python:3.11.6.
RUN apt-get update \
    && apt-get install -y --no-install-recommends firefox-esr \
    && rm -rf /var/lib/apt/lists/*

# Remove existing geckodriver from PATH if present
RUN rm /usr/local/bin/geckodriver || true

# Install GeckoDriver (note: the download URL must contain no spaces)
RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz \
    && tar -zxvf geckodriver-v0.33.0-linux64.tar.gz \
    && mv geckodriver /usr/local/bin/ \
    && rm geckodriver-v0.33.0-linux64.tar.gz

# Define environment variable (key=value is the current form; the
# space-separated form is legacy)
ENV NAME=comboFetch

# Run dimeFetch script by default when the container launches
CMD ["python", "dimeFetch/AssistFetch.py"]

3

Answers


  1. It looks like the core part of your error is this:

    2023-12-06 02:12:20 selenium.common.exceptions.WebDriverException: Message: Process unexpectedly closed with status 255
    

    And doing a quick search for that error brings me to this other post:

    Selenium py – Process unexpectedly closed with status 255

    So I’d check there for a solution.

    Login or Signup to reply.
  2. A better solution is not to use Selenium at all: the site's backing API returns the data in a clean JSON format.

    import requests
    import pandas as pd
    
    
    url = 'https://stats.nba.com/stats/leaguedashptstats'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'referer': 'https://www.nba.com/'}
    payload = {
        'LastNGames': '1',
        'LeagueID': '00',
        'Location': '',
        'Month': '0',
        'OpponentTeamID': '0',
        'Outcome': '',
        'PORound': '0',
        'PerMode': 'PerGame',
        'PlayerExperience': '',
        'PlayerOrTeam': 'Player',
        'PlayerPosition': '',
        'PtMeasureType': 'Passing',
        'Season': '2023-24',
        'SeasonSegment': '',
        'SeasonType': 'Regular Season',
        'StarterBench': '',
        'TeamID': '0'}
    
    jsonData = requests.get(url, headers=headers, params=payload, timeout=10).json()
    data = jsonData['resultSets'][0]
    df = pd.DataFrame(data['rowSet'], columns=data['headers'])
    

    Output:

    1st 5 rows of 263

    print(df.head().to_string())
       PLAYER_ID    PLAYER_NAME     TEAM_ID TEAM_ABBREVIATION  GP  W  L   MIN  PASSES_MADE  PASSES_RECEIVED  AST  FT_AST  SECONDARY_AST  POTENTIAL_AST  AST_POINTS_CREATED  AST_ADJ  AST_TO_PASS_PCT  AST_TO_PASS_PCT_ADJ
    0    1630639    A.J. Lawson  1610612742               DAL   1  1  0   6.7          4.0              4.0  0.0     0.0            0.0            0.0                 0.0      0.0            0.000                0.000
    1     203932   Aaron Gordon  1610612743               DEN   1  0  1  31.3         28.0             21.0  2.0     0.0            0.0            4.0                 6.0      2.0            0.071                0.071
    2    1628988  Aaron Holiday  1610612745               HOU   1  1  0  26.9         42.0             47.0  4.0     0.0            0.0            6.0                12.0      4.0            0.095                0.095
    3    1630598  Aaron Wiggins  1610612760               OKC   1  0  1  15.5         13.0             13.0  0.0     0.0            1.0            3.0                 0.0      1.0            0.000                0.077
    4     201143     Al Horford  1610612738               BOS   1  0  1  31.4         23.0             17.0  4.0     0.0            0.0            7.0                12.0      4.0            0.174                0.174
    
    Login or Signup to reply.
  3. Try to install chrome in the docker container:

    # Use an official Python runtime as a parent image
    FROM python:3.11.6

    # Set the working directory to /app
    WORKDIR /home/retupmoc/PycharmProjects/EffortScraper/app

    # Add Google's signing key and the Chrome apt repository.
    # NOTE(review): apt-key is deprecated on current Debian; prefer placing
    # the key under /etc/apt/keyrings and using a signed-by clause.
    RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
    RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
    RUN apt-get -y update
    # Install Chrome plus unzip (needed to unpack chromedriver below)
    RUN apt-get install -y google-chrome-stable
    RUN apt-get install -yqq unzip
    # NOTE(review): the LATEST_RELEASE endpoint only serves drivers up to
    # Chrome 114; for Chrome 115+ use the "Chrome for Testing" downloads.
    RUN wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip
    RUN unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
    ENV DISPLAY=:99

    # Copy the contents of the PyCharmProjects directory into the container at /app
    COPY . .

    # Install Python dependencies. Continued RUN lines need a trailing
    # backslash; without it the "&&" line is an invalid instruction.
    RUN pip install --upgrade pip \
        && pip install -r requirements.txt

    # Remove existing geckodriver from PATH if present
    RUN rm /usr/local/bin/geckodriver || true

    # ... rest of your code
    

    And use it like this in the script:

    # Selenium imports
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By

    def set_chrome_options():
        """Return Chrome Options configured for headless use in a container.

        --no-sandbox is required when Chrome runs as root (the default in
        Docker); --disable-dev-shm-usage avoids crashes caused by the tiny
        default /dev/shm inside containers.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # Disable image loading to speed up scraping; use the supported
        # add_experimental_option API rather than mutating the dict directly.
        chrome_prefs = {"profile.default_content_settings": {"images": 2}}
        chrome_options.add_experimental_option("prefs", chrome_prefs)
        return chrome_options

    browser = webdriver.Chrome(options=set_chrome_options())

    browser.get("some-url")

    # Selenium 4 removed the find_elements_by_* helpers; use the
    # find_elements(By.<strategy>, ...) form instead.
    rows = browser.find_elements(By.XPATH, "your-xpath selector")
    
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search