HTML - I keep getting an internal server error in my Flask app for a web scraper

AE13
January 13, 2024
188 views
0 votes
2 Answers

from flask import Flask, render_template
from bs4 import BeautifulSoup
import requests
import pandas as pd

app = Flask(__name__)

@app.route("/")
def job_scraper() -> str:
    """Scrape one hard-coded LinkedIn job-search page and render it as an HTML table.

    The page is fetched with ``requests``, parsed with BeautifulSoup, and one
    list per column (title, subtitle, location, date, link) is collected.
    The lists are combined into a DataFrame whose ``to_html()`` output is
    passed to ``index.html`` as ``table_html``.

    NOTE(review): every ``find_all`` below scans the whole page independently,
    so nothing guarantees the five lists end up the same length. When they
    differ, ``pd.DataFrame(data)`` raises
    ``ValueError: All arrays must be of the same length``, which Flask reports
    as a 500 Internal Server Error.
    """
    url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"
    
    url_request_01 = requests.get(url01)
    soup = BeautifulSoup(url_request_01.text, 'html.parser')
    
    # Page-wide lookups, one per column. A job card that lacks one of these
    # elements simply contributes nothing to that column's list.
    job_title_pull = soup.find_all(class_="base-search-card__title")
    job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
    job_location_pull = soup.find_all(class_="job-search-card__location")
    job_date_pull = soup.find_all(class_="job-search-card__listdate")
    # href=True restricts the match to anchors that actually carry an href.
    job_links = [job_link_pull['href'] for job_link_pull in soup.find_all('a', href=True, class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]")]
    
    # Reduce each matched element to its stripped text.
    job_title_data = [title.text.strip() for title in job_title_pull]
    job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
    job_location_data = [location.text.strip() for location in job_location_pull]
    job_date_data = [date.text.strip() for date in job_date_pull]

    data = {'title': job_title_data,
            'subtitle': job_subtitle_data,
            'location': job_location_data,
            'date': job_date_data,
            'link': job_links}

    
    # Raises ValueError if the column lists above differ in length.
    df = pd.DataFrame(data)
    return render_template('index.html', table_html=df.to_html(classes='table table-striped', index=False))

# Start Flask's built-in development server only when run as a script;
# host 0.0.0.0 makes it listen on all addresses, on port 5001.
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5001)

Here is my HTML code:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Your Web App</title>
</head>
<body>
    <h1>Your Data</h1>
    {# table_html is the DataFrame.to_html() markup passed in by the Flask view.
       The `safe` filter stops Jinja from escaping it, so it renders as a table
       instead of literal tag text. #}
    {{ table_html | safe }}
</body>
</html>

Here is the error

applemacbookproa2289@APPLEs-MacBook-Pro-2 web-scraping-01 % python3 scraping.py
 * Serving Flask app 'scraping'
 * Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://192.168.4.53:5001
Press CTRL+C to quit
[2024-01-12 10:03:36,006] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 867, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 852, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/applemacbookproa2289/Desktop/web-scraping-01/scraping.py", line 33, in job_scraper
    df = pd.DataFrame(data)
         ^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 733, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
    index = _extract_index(arrays)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
    raise ValueError("All arrays must be of the same length")
ValueError: All arrays must be of the same length
192.168.4.53 - - [12/Jan/2024 10:03:36] "GET / HTTP/1.1" 500 -
[2024-01-12 10:04:22,869] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 867, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 852, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/applemacbookproa2289/Desktop/web-scraping-01/scraping.py", line 33, in job_scraper
    df = pd.DataFrame(data)
         ^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 733, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
    index = _extract_index(arrays)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
    raise ValueError("All arrays must be of the same length")
ValueError: All arrays must be of the same length
192.168.4.53 - - [12/Jan/2024 10:04:22] "GET / HTTP/1.1" 500 -

I have already tried a bunch of random solutions, but I can't figure out what's wrong. Could you please point me in the right direction? I am just testing this out. Once I am able to get the scraped data into HTML, the next step for me is to take the job title and location as user input and pull the data, something similar to this:

from urllib.parse import urlencode

# Read the search terms interactively.
jobtitle = input("Enter job title: ")
location = input("Enter job location: ")

# Build the query string with urlencode() rather than string concatenation:
# it percent-encodes every value, so spaces, commas, "&", "#", etc. in the
# user's input cannot break or alter the query parameters.
query = urlencode(
    {
        "keywords": jobtitle,
        "location": location,
        "geoId": "",
        "trk": "public_jobs_jobs-search-bar_search-submit",
        "position": 1,
        "pageNum": 0,
    }
)
url01 = f"https://www.linkedin.com/jobs/search?{query}"

Answers

Chosen as BEST ANSWER

I was able to solve the issue; here is my solution:

from flask import Flask, render_template
from bs4 import BeautifulSoup
import requests
import pandas as pd

app = Flask(__name__)


@app.route("/")
def job_scraper():
    """Scrape one hard-coded LinkedIn job-search page and render ``index.html``.

    Collects title, subtitle, location, link and date columns into a
    DataFrame and passes both the DataFrame (``df``) and its HTML rendering
    (``df_html``) to the template.

    The listing date is looked up per job card, with ``None`` for cards that
    have no date element, so the date column always has one entry per card.
    """
    url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"

    # A timeout keeps the request from hanging the view indefinitely.
    url_request_01 = requests.get(url01, timeout=10)
    soup = BeautifulSoup(url_request_01.text, 'html.parser')

    job_title_pull = soup.find_all(class_="base-search-card__title")
    job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
    job_location_pull = soup.find_all(class_="job-search-card__location")
    job_links = [
        job_link_pull['href']
        for job_link_pull in soup.find_all(
            'a',
            href=True,
            class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]",
        )
    ]

    job_title_data = [title.text.strip() for title in job_title_pull]
    job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
    job_location_data = [location.text.strip() for location in job_location_pull]
    # Not every job card has a list-date element; emit None for those cards
    # instead of silently dropping the row from this column.
    job_date_data = [
        d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
        for job in soup.select(".job-search-card")
    ]

    # NOTE(review): title/subtitle/location/link are still gathered page-wide;
    # this assumes every card has exactly one of each - confirm against the
    # live markup.
    data = {
        'title': job_title_data,
        'subtitle': job_subtitle_data,
        'location': job_location_data,
        'link': job_links,  # was job_date_data, which duplicated the date column
        'date': job_date_data,
    }
    df = pd.DataFrame(data)
    df_html = df.to_html(table_id="table")

    return render_template('index.html', df=df, df_html=df_html)


# Only start the development server when executed as a script, not on import.
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5001)

Here is the HTML template that loops through the DataFrame created in Python:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Your Web App</title>
</head>
<body>
    {# `df` is the pandas DataFrame passed in by the Flask view. The view also
       passes `df_html`, which this template does not use. #}
    <table>
        <tr>
            <th>Title</th> <!-- Close the th tag properly -->
        </tr>
        {# One table row per value in the DataFrame's `title` column. #}
        {% for cell in df.title %}
        <tr>
            <td> {{ cell }} </td>
        </tr>
        {% endfor %}
    </table>
    <h1>Your Data1</h1>
</body>
</html>

(Edit)

The problem seems to be that not every job has an element with class="job-search-card__listdate", so you need to check for that:

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Hard-coded LinkedIn job-search results page to scrape.
url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"

# A timeout keeps the script from hanging indefinitely on a stalled request.
url_request_01 = requests.get(url01, timeout=10)
soup = BeautifulSoup(url_request_01.text, "html.parser")

# Page-wide lookups for the columns that are read straight off the page.
job_title_pull = soup.find_all(class_="base-search-card__title")
job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
job_location_pull = soup.find_all(class_="job-search-card__location")
job_links = [
    job_link_pull["href"]
    for job_link_pull in soup.find_all(
        "a",
        href=True,
        class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]",
    )
]

job_title_data = [title.text.strip() for title in job_title_pull]
job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
job_location_data = [location.text.strip() for location in job_location_pull]

# Dates are resolved per job card: a card without a list-date element yields
# None, so this column keeps one entry per card instead of coming up short.
# (The earlier page-wide find_all for the date class was dead code and is gone.)
job_date_data = [
    d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
    for job in soup.select(".job-search-card")
]

data = {
    "title": job_title_data,
    "subtitle": job_subtitle_data,
    "location": job_location_data,
    "date": job_date_data,
    "link": job_links,
}

df = pd.DataFrame(data)
print(df)

Prints:

                                                                     title                                        subtitle       location          date                                                                                                                                                                                                                                                                              link
0                            Junior/Entry Level Software Developer(Remote)                                   SynergisticIT  Las Vegas, NV   1 month ago                            https://www.linkedin.com/jobs/view/junior-entry-level-software-developer-remote-at-synergisticit-3767588842?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=gBXODO%2FGuULT55Q6dJzufw%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card
1                                     Junior Software Development Engineer  Team Remotely Inc: Talent Solution Reimagined!  Las Vegas, NV          None  https://www.linkedin.com/jobs/view/junior-software-development-engineer-at-team-remotely-inc-talent-solution-reimagined%21-3805052007?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=FXvU%2FHLIKDYVFT6Pjv8miA%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card
2                                           Backend Developer(Entry Level)                                   SynergisticIT  Las Vegas, NV   3 weeks ago                                             https://www.linkedin.com/jobs/view/backend-developer-entry-level-at-synergisticit-3784401022?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=BzXsFKHN1CCduvAj2TWNNA%3D%3D&position=3&pageNum=0&trk=public_jobs_jserp-result_search-card
3                                                 Junior Software Engineer                                   SynergisticIT  Las Vegas, NV   1 month ago                                                https://www.linkedin.com/jobs/view/junior-software-engineer-at-synergisticit-3767595083?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=v6IbHxai%2Bjh7sUI6kl0PLA%3D%3D&position=4&pageNum=0&trk=public_jobs_jserp-result_search-card
4                             Data Scientist(Rermote) - Junior/Entry Level                                   SynergisticIT  Las Vegas, NV    1 week ago                               https://www.linkedin.com/jobs/view/data-scientist-rermote-junior-entry-level-at-synergisticit-3792903797?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=s%2BTFVT3a5IxYwJ4UGrhHdg%3D%3D&position=5&pageNum=0&trk=public_jobs_jserp-result_search-card

...

Please signup or login to give your own answer.

Click here to cancel reply.

HTML – I keep getting an internal server error in my Flask app for a web scraper

Answers