skip to Main Content
from flask import Flask, render_template
from bs4 import BeautifulSoup
import requests
import pandas as pd

app = Flask(__name__)

@app.route("/")
def job_scraper():
    """Scrape one page of LinkedIn job-search results and render them.

    Extracts title, company (subtitle), location, posting date and link for
    each job card, builds a pandas DataFrame, and renders it through
    ``index.html`` as a Bootstrap-styled HTML table.

    Returns:
        The rendered ``index.html`` page with the scraped jobs table.
    """
    url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"

    url_request_01 = requests.get(url01)
    soup = BeautifulSoup(url_request_01.text, 'html.parser')

    job_title_pull = soup.find_all(class_="base-search-card__title")
    job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
    job_location_pull = soup.find_all(class_="job-search-card__location")
    job_links = [job_link_pull['href'] for job_link_pull in soup.find_all('a', href=True, class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]")]

    job_title_data = [title.text.strip() for title in job_title_pull]
    job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
    job_location_data = [location.text.strip() for location in job_location_pull]

    # BUG FIX: not every job card contains a "job-search-card__listdate"
    # element, so a flat find_all() on that class returned fewer dates than
    # titles, and pd.DataFrame raised "All arrays must be of the same
    # length" (see the traceback this route produced). Walk the job cards
    # instead and record None when the date is absent, guaranteeing exactly
    # one date entry per card.
    job_date_data = [
        d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
        for job in soup.select(".job-search-card")
    ]

    data = {'title': job_title_data,
            'subtitle': job_subtitle_data,
            'location': job_location_data,
            'date': job_date_data,
            'link': job_links}

    df = pd.DataFrame(data)
    return render_template('index.html', table_html=df.to_html(classes='table table-striped', index=False))

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5001)

Here is my HTML code:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Your Web App</title>
</head>
<body>
    <h1>Your Data</h1>
    <!-- table_html is pre-rendered markup from pandas DataFrame.to_html();
         the "safe" filter tells Jinja2 not to HTML-escape it. -->
    {{ table_html | safe }}
</body>
</html>


Here is the error output:

applemacbookproa2289@APPLEs-MacBook-Pro-2 web-scraping-01 % python3 scraping.py
 * Serving Flask app 'scraping'
 * Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://192.168.4.53:5001
Press CTRL+C to quit
[2024-01-12 10:03:36,006] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 867, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 852, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/applemacbookproa2289/Desktop/web-scraping-01/scraping.py", line 33, in job_scraper
    df = pd.DataFrame(data)
         ^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 733, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
    index = _extract_index(arrays)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
    raise ValueError("All arrays must be of the same length")
ValueError: All arrays must be of the same length
192.168.4.53 - - [12/Jan/2024 10:03:36] "GET / HTTP/1.1" 500 -
[2024-01-12 10:04:22,869] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 867, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/flask/app.py", line 852, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/applemacbookproa2289/Desktop/web-scraping-01/scraping.py", line 33, in job_scraper
    df = pd.DataFrame(data)
         ^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 733, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 503, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 114, in arrays_to_mgr
    index = _extract_index(arrays)
            ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/internals/construction.py", line 677, in _extract_index
    raise ValueError("All arrays must be of the same length")
ValueError: All arrays must be of the same length
192.168.4.53 - - [12/Jan/2024 10:04:22] "GET / HTTP/1.1" 500 -

I have already tried a bunch of random solutions, but I can't figure out what's wrong. Could you please point me in the right direction, as I am just testing this out. Once I am able to get the scraped data into HTML, the next step for me is to take a user-input job title and location to pull the data, something similar to this:

jobtitle = input("Enter job title: ")
location = input("Enter job location: ")

url01 = "https://www.linkedin.com/jobs/search?keywords="+jobtitle+"&location="+location+"&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"

2

Answers


  1. Chosen as BEST ANSWER

    I was able to solve the issue — here is my solution:

    from flask import Flask, render_template
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd

    app = Flask(__name__)

    @app.route("/")
    def job_scraper():
        """Scrape one page of LinkedIn job results and render index.html."""
        url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"

        url_request_01 = requests.get(url01)
        soup = BeautifulSoup(url_request_01.text, 'html.parser')

        job_title_pull = soup.find_all(class_="base-search-card__title")
        job_subtitle_pull = soup.find_all(class_="base-search-card__subtitle")
        job_location_pull = soup.find_all(class_="job-search-card__location")
        job_links = [job_link_pull['href'] for job_link_pull in soup.find_all('a', href=True, class_="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]")]

        job_title_data = [title.text.strip() for title in job_title_pull]
        job_subtitle_data = [subtitle.text.strip() for subtitle in job_subtitle_pull]
        job_location_data = [location.text.strip() for location in job_location_pull]

        # Pull the date per job card (None when the card has no listdate
        # element) so every column ends up the same length.
        job_date_data = [
            d.text.strip() if (d := job.select_one(".job-search-card__listdate")) else None
            for job in soup.select(".job-search-card")
        ]

        data = {'title': job_title_data,
                'subtitle': job_subtitle_data,
                'location': job_location_data,
                # BUG FIX: 'link' was assigned job_date_data, duplicating the
                # dates and discarding the scraped URLs in job_links.
                'link': job_links,
                'date': job_date_data}
        df = pd.DataFrame(data)
        df_html = df.to_html(table_id="table")

        return render_template('index.html', df=df, df_html=df_html)

    app.run(host="0.0.0.0", port=5001)
    

    Here is the HTML code that loops through the DataFrame created in Python:

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Your Web App</title>
    </head>
    <body>
        <table>
            <tr>
                <th>Title</th> <!-- Close the th tag properly -->
            </tr>
            <!-- Jinja2 loop over the "title" column of the DataFrame
                 passed in as df; emits one table row per job title. -->
            {% for cell in df.title %}
            <tr>
                <td> {{ cell }} </td>
            </tr>
            {% endfor %}
        </table>
        <h1>Your Data1</h1>
    </body>
    </html>
    

  2. The problem seems to be that not every job listing has an element with class="job-search-card__listdate", so you need to check for that:

    # Scrape one page of LinkedIn job-search results into a DataFrame.
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup


    url01 = "https://www.linkedin.com/jobs/search?keywords=Python%20%28Programming%20Language%29&location=Las%20Vegas%2C%20Nevada%2C%20United%20States&geoId=100293800&currentJobId=3477751834&position=7&pageNum=1"

    response = requests.get(url01)
    page = BeautifulSoup(response.text, "html.parser")


    def collect_text(css_class):
        # Stripped text of every element carrying the given class.
        return [element.text.strip() for element in page.find_all(class_=css_class)]


    job_title_data = collect_text("base-search-card__title")
    job_subtitle_data = collect_text("base-search-card__subtitle")
    job_location_data = collect_text("job-search-card__location")

    link_classes = "base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]"
    job_links = [
        anchor["href"]
        for anchor in page.find_all("a", href=True, class_=link_classes)
    ]

    # Not every job card carries a listdate element, so walk the cards and
    # record None when the date is absent - this keeps all columns the same
    # length when building the DataFrame.
    job_date_data = []
    for card in page.select(".job-search-card"):
        stamp = card.select_one(".job-search-card__listdate")
        job_date_data.append(stamp.text.strip() if stamp else None)

    df = pd.DataFrame(
        {
            "title": job_title_data,
            "subtitle": job_subtitle_data,
            "location": job_location_data,
            "date": job_date_data,
            "link": job_links,
        }
    )
    print(df)
    

    Prints:

                                                                         title                                        subtitle       location          date                                                                                                                                                                                                                                                                              link
    0                            Junior/Entry Level Software Developer(Remote)                                   SynergisticIT  Las Vegas, NV   1 month ago                            https://www.linkedin.com/jobs/view/junior-entry-level-software-developer-remote-at-synergisticit-3767588842?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=gBXODO%2FGuULT55Q6dJzufw%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card
    1                                     Junior Software Development Engineer  Team Remotely Inc: Talent Solution Reimagined!  Las Vegas, NV          None  https://www.linkedin.com/jobs/view/junior-software-development-engineer-at-team-remotely-inc-talent-solution-reimagined%21-3805052007?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=FXvU%2FHLIKDYVFT6Pjv8miA%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card
    2                                           Backend Developer(Entry Level)                                   SynergisticIT  Las Vegas, NV   3 weeks ago                                             https://www.linkedin.com/jobs/view/backend-developer-entry-level-at-synergisticit-3784401022?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=BzXsFKHN1CCduvAj2TWNNA%3D%3D&position=3&pageNum=0&trk=public_jobs_jserp-result_search-card
    3                                                 Junior Software Engineer                                   SynergisticIT  Las Vegas, NV   1 month ago                                                https://www.linkedin.com/jobs/view/junior-software-engineer-at-synergisticit-3767595083?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=v6IbHxai%2Bjh7sUI6kl0PLA%3D%3D&position=4&pageNum=0&trk=public_jobs_jserp-result_search-card
    4                             Data Scientist(Rermote) - Junior/Entry Level                                   SynergisticIT  Las Vegas, NV    1 week ago                               https://www.linkedin.com/jobs/view/data-scientist-rermote-junior-entry-level-at-synergisticit-3792903797?refId=6e6Ii6kYrj9B0Fhpi6T3ag%3D%3D&trackingId=s%2BTFVT3a5IxYwJ4UGrhHdg%3D%3D&position=5&pageNum=0&trk=public_jobs_jserp-result_search-card
    
    ...
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search