skip to Main Content

I am making a website that has a scraper which scrapes another website. On my website I want to have this counter (it shows which page is currently being scraped, so that the user knows everything is running smoothly)

#---------------------------------------------- scraper.py

# ... rest of the code

# Page counter: starts at 0 and is incremented before each progress message.
counter = 0

# No exit condition is visible in this excerpt.
while True:
    counter += 1
    # Progress is written to the console only.
    print(f'Extracting infromation from page: {counter}')

#... rest of the code

to be shown on my website (instead of in the console log), between these two buttons. How can I achieve this?

#---------------------------------------------- scrape.html

#... rest of the code

<button type="submit" class="btn btn-primary">Start Scraping</button>
<a href="{{ url_for('view_database') }}" class="btn btn-info">View Database</a>

#... rest of the code

The code for the route and its function is the following:

@app.route('/scrape', methods=['GET', 'POST'])
def scrape():
    """Show the scraping form and, on a valid submission, launch the scraper.

    When the form validates and both ``city`` and ``apart_or_house`` are
    non-empty, ``run_scraper`` is started on a new thread and a success
    message is flashed. When the form validates but one of those two values
    is missing, an error message is flashed instead. The ``scrape.html``
    template is rendered in every case.

    NOTE(review): ``g.scraping_finished`` is only ever assigned ``False``
    in this function -- confirm whether the "finished" message below can
    actually be triggered.
    """
    form = ScrapingForm()

    if form.validate_on_submit():
        # Values are read straight from the posted form data.
        posted = request.form
        city = posted.get('city')
        subregion = posted.get('subregion')
        apart_or_house = posted.get('apart_or_house')
        words_to_check = posted.get('words_to_check')  # Retrieve words to check input

        if not (city and apart_or_house):
            flash('Please fill all required fields.', 'error')
        else:
            g.scraping_finished = False
            scraper_args = (city, subregion, apart_or_house, words_to_check)
            worker = threading.Thread(target=run_scraper, args=scraper_args)
            worker.start()
            flash('Scraping started!', 'success')

    finished = g.get('scraping_finished', False)
    if finished:
        flash('Scraper has finished!', 'info')

    return render_template('scrape.html', form=form)


@app.route('/scraping-finished')
def scraping_finished():
    """Render the ``scraping_finished.html`` template."""
    finished_page = render_template('scraping_finished.html')
    return finished_page


def run_scraper(city, subregion, apart_or_house, words_to_check):
    """Validate the inputs, build the search URL and run ``scraper.py``.

    Args:
        city: City name; must not be the empty string.
        subregion: Optional subregion; when falsy it is left out of the URL.
        apart_or_house: Must be ``"wohnung"`` or ``"haus"``.
        words_to_check: Passed through to ``scraper.py`` as its last argument.

    Returns:
        ``None`` when validation fails; otherwise the result of
        ``redirect(url_for('scraping_finished'))`` once the subprocess has
        exited.
    """
    # NOTE(review): scrape() starts this function on a separate thread.
    # Flask ties flash() and current_app to an active request/app context,
    # so confirm that these calls actually work from here.
    if city == "":
        flash("City name cannot be empty. Please enter a valid city name.", "error")
        return

    if apart_or_house not in ("wohnung", "haus"):
        if apart_or_house == "":
            flash("This field cannot be empty. Please enter what are you buying.", "error")
        else:
            flash("Please enter either 'wohnung' or 'haus'.", "error")
        # Stop in both cases: previously the empty-string branch had no
        # return and spun forever inside a `while True` loop.
        return

    if subregion:
        base_url = f"https://www.immobilienscout24.de/Suche/de/{city}/{city}/{subregion}/{apart_or_house}-kaufen"
    else:
        base_url = f"https://www.immobilienscout24.de/Suche/de/{city}/{city}/{apart_or_house}-kaufen"

    # Run the scraper script with the provided inputs and base_url.
    # The optional values may be None when the form field was absent;
    # subprocess.run() rejects None in the argument list, so pass "" instead.
    subprocess.run([
        'python',
        'scraper.py',
        city,
        subregion or "",
        apart_or_house,
        base_url,
        words_to_check or "",
    ])

    with current_app.test_request_context():
        return redirect(url_for('scraping_finished'))

2

Answers


  1. One solution is to use server-sent events (SSE): the server streams the counter and the client listens for it with an EventSource.

    Add an element to display the counter value in your scrape.html:

    <button type="submit" class="btn btn-primary">Start Scraping</button>
        <span id="counter">0</span>
        <a href="/" class="btn btn-info">View Database</a>
    
        <script>
            var source = new EventSource("{{ url_for('scrape') }}");
            source.onmessage = function(event) {
                document.getElementById('counter').innerHTML = event.data;
            };
        </script>
    

    Change your scrape.py to:

    # Application object that the route decorators below are registered on.
    app = Flask(__name__)
    # The key below looks like a placeholder -- replace it with a real secret.
    app.config['SECRET_KEY'] = 'your-secret-key-for-csrf'
    
    
    class ScrapingForm(FlaskForm):
        """Form with the inputs for one scraping run.

        ``city`` and ``apart_or_house`` carry a ``DataRequired`` validator;
        ``subregion`` and ``words_to_check`` have no validators.
        """

        city = StringField(label='City', validators=[DataRequired()])
        subregion = StringField(label='Subregion')
        apart_or_house = SelectField(
            label='Apartment or House',
            choices=[
                ('apartment', 'Apartment'),
                ('house', 'House'),
            ],
            validators=[DataRequired()],
        )
        words_to_check = StringField(label='Words to Check')
        submit = SubmitField(label='Start Scraping')
    
    
    def run_scraper(city, subregion, apart_or_house, words_to_check):
        """Yield an endless stream of server-sent-event messages with the page counter.

        Each yielded string has the form ``data: <counter>`` followed by a
        blank line. The arguments are accepted but not used by this loop.

        Yields:
            str: One event-stream message per iteration, counting from 1.
        """
        counter = 0
        while True:
            counter += 1
            print(f'Extracting information from page: {counter}')
            # An event-stream message must end with "\n\n"; without that
            # terminator the browser never dispatches the message event.
            yield f'data: {counter}\n\n'
    
    
    @app.route('/', methods=['GET', 'POST'])
    def scrape():
        """Render the scraping form, or stream scraper progress for a valid submission.

        A validated submission with non-empty ``city`` and ``apart_or_house``
        returns a ``text/event-stream`` response fed by ``run_scraper``. A
        validated submission missing one of them flashes an error. Every
        non-streaming path renders ``scrape.html``.

        NOTE(review): the stream is only returned from the validated-form
        branch -- confirm that the client's EventSource request reaches it.
        """
        form = ScrapingForm()
        if not form.validate_on_submit():
            return render_template('scrape.html', form=form)

        posted = request.form
        city = posted.get('city')
        subregion = posted.get('subregion')
        apart_or_house = posted.get('apart_or_house')
        words_to_check = posted.get('words_to_check')

        if not (city and apart_or_house):
            flash('Please fill all required fields.', 'error')
            return render_template('scrape.html', form=form)

        events = run_scraper(city, subregion, apart_or_house, words_to_check)
        return Response(stream_with_context(events), mimetype='text/event-stream')
    
    
    # Start the app (with debug mode on) only when this file is run directly.
    if __name__ == '__main__':
        app.run(debug=True)
    

    I also replaced the thread with stream_with_context, and removed the g.
    The counter value is updated on document.getElementById('counter').innerHTML = event.data.

    Login or Signup to reply.
  2. On your html:
    Counter value: <div id="counter"></div>

    On your js file:

    /**
     * Fetch the current counter from `/get_counter` and write its `count`
     * field into the element with id "counter".
     */
    function updateCounter() {
        fetch('/get_counter')
            .then(response => response.json())
            .then(data => {
                document.getElementById('counter').innerText = data.count;
            })
            // Report failed requests instead of leaving the rejection unhandled.
            .catch(error => console.error('Could not update counter:', error));
    }

    // Poll once per second; time the updates as needed.
    setInterval(updateCounter, 1000);
    

    On your flask backend:

    @app.route('/get_counter')
    def get_count():
        """Return the current counter as JSON in the form ``{"count": <value>}``."""
        # TODO: replace this placeholder with your logic for getting the
        # counter count running on the backend.
        counter = 0
        return jsonify({'count': counter})
    

    You can use AJAX polling for this; however, it produces a constant stream of requests even when nothing has changed. A better approach would be to run the counter in JavaScript on the client side, or to use a button that retrieves it from the server as and when needed.

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search