
We have a vendor that sells many products that we want to include. They unfortunately do not have an API set up to retrieve information. They do, however, give us product lists that show just the SKU of each item. I want to log in, go to the page that holds the product, scrape specific info from that page, and move on to the next. I want to do this line by line, one at a time, since their website is fragile and prone to going offline (a throttling sketch for this follows the step list below).

Basically, this spider should:

  1. Log in to the website
  2. Load a CSV
  3. Set the current row of the CSV as the target URL
  4. Scrape the data
  5. Load the next URL
  6. Repeat steps 3-5 until the end of the CSV
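
Since the site is fragile, one way to enforce the one-at-a-time behavior is through Scrapy's throttling settings rather than the spider logic itself. A minimal sketch, assuming a standalone spider; the setting names are standard Scrapy settings, but the values are placeholder assumptions to tune for the vendor's site:

from scrapy import Spider

class FragileSiteSpider(Spider):
    name = 'fragile_site'
    # Keep requests strictly sequential so the vendor's server
    # never sees concurrent traffic from this spider.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,      # one request in flight at a time
        'DOWNLOAD_DELAY': 2,           # assumed 2-second pause between requests
        'RETRY_TIMES': 5,              # retry pages when the site drops offline
        'AUTOTHROTTLE_ENABLED': True,  # back off automatically when responses slow down
    }

Here is the spider so far: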

import scrapy
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import csv



class SPIDER_NAME(Spider):
    name = 'RPLogin_Final'
    start_urls = ['WEBSITEURL']

    def parse(self, response):
        token = response.css("form input[name=__RequestVerificationToken]::attr(value)").extract_first()

        return FormRequest(url=self.start_urls[0],
                           formdata={'__RequestVerificationToken': token,
                                     'upw': 'PASSWORD',
                                     'uid': 'USERNAME'},
                           callback=self.scrape_now)

    def scrape_now(self, response):
        print("logged in!")
        # do stuff / go to next page
        with open('partsList.csv', 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            for row in csv_reader:
                print(row)
                yield scrapy.Request(url=row, callback=self.parse_product)

    def parse_product(self, response):
        product = response.css('div.row.jsCartContainer.product-list-item')
        yield {
            'Name': product.css("p.jplist-text-filter::text").get(),
            'Part_Num': product.css("a.jplist-text-filter.jplist-item-num::text").get(),
            'Purchase_Price': product.css("li.jplist-item-price.bold::text").get(),
            'Suggested_Retail': product.css("li:nth-child(2)").get(),
            'In_Stock': product.css("li:nth-child(5)").get(),
            'Image_Link': product.css("img").get()
        }

        open_in_browser(response)
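
As an aside, FormRequest.from_response can build the login request from the page's own form and copies hidden inputs such as __RequestVerificationToken automatically; a minimal sketch of parse in that style, assuming the login form is the first form on the page:

    def parse(self, response):
        # from_response lifts hidden inputs (including the verification
        # token) from the page's form, so only credentials are supplied.
        return FormRequest.from_response(
            response,
            formdata={'upw': 'PASSWORD', 'uid': 'USERNAME'},
            callback=self.scrape_now,
        )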

This spider worked fine when:

url = 'string_URL'

Then I changed this to a CSV file with:

with open('partsList.csv') as file:
    url=[line.strip() for line in file]

and I received this error:

logged in!
['URLS']

  File "/home/partsales/Desktop/Python/parts-env/lib/python3.10/site-packages/scrapy/http/request/__init__.py", line 133, in _set_url
    raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got list
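
For context on what the traceback is pointing at: csv.reader yields each row as a list of column values, so even a one-column file produces ['URL'] rather than 'URL' (and the list comprehension above likewise leaves url holding a whole list of lines). Taking the first column gives Request the string it expects; a minimal sketch against the loop in scrape_now:

for row in csv_reader:
    # row is a list like ['https://example.com/part']; pass column 0
    yield scrapy.Request(url=row[0].strip(), callback=self.parse_product)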

2 Answers


  1. Chosen as BEST ANSWER

    So by using karel van dongen's suggestion, but replacing the CSV file with a TXT file, I was able to get this to execute perfectly!

    Complete (redacted) code below:

    import scrapy
    from scrapy import Spider
    from scrapy import Request
    from scrapy.http import FormRequest
    from scrapy.utils.response import open_in_browser


    class PartsSpider(Spider):
        name = 'productSpider'
        start_urls = ['WEBSITE URL']

        def parse(self, response):
            token = response.css("form input[name=__RequestVerificationToken]::attr(value)").extract_first()

            return FormRequest(url=self.start_urls[0],
                               formdata={'__RequestVerificationToken': token,
                                         #
                                         # Change USER AND PASSWORD
                                         #
                                         'upw': 'ENCODED PASSWORD',
                                         'uid': 'USERNAME'},
                               callback=self.scrape_now)

        def scrape_now(self, response):
            print("logged in!")
            #
            # CHANGE FILE NAME
            #
            with open('partlist.txt') as partslist:
                for line in partslist:
                    curr_url = line.rstrip()
                    print(curr_url)
                    yield Request(url=curr_url, callback=self.parse_product)

        def parse_product(self, response):
            product = response.css('div.row.jsCartContainer.product-list-item')
            yield {
                #
                # CHANGE BRAND
                #
                'Brand': 'BRAND',
                'Name': product.css("p.jplist-text-filter::text").get(),
                'Part_Num': product.css("a.jplist-text-filter.jplist-item-num::text").get().replace(' ', ''),
                'Purchase_Price': product.css("li.jplist-item-price.bold::text").get().replace('Your Price:', '').replace(' / EA', ''),
                'Suggested_Retail': product.css("li:nth-child(2)").get().replace('<li>Suggested Retail Price: ', '').replace('</li>', ''),
                'In_Stock': product.css("li:nth-child(5)").get().replace('<li class="jplist-in-stock">Available: ', '').replace('</li>', ''),
                'Image_Link': product.css("img").get().replace('<img class="product-img" src="', 'https://WEBSITE.com').replace('"', '')[:-16]
            }
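
    A note on the string surgery in parse_product: the same fields can be pulled without slicing raw HTML by asking the selectors for text nodes or attributes directly. A hedged sketch in that style (the container and class names are copied from the code above; the exact prefix strings and the relative image src are assumptions):

    def parse_product(self, response):
        product = response.css('div.row.jsCartContainer.product-list-item')
        retail = product.css('li:nth-child(2)::text').get()   # text only, no <li> tags
        stock = product.css('li:nth-child(5)::text').get()    # text only, no <li> tags
        img_src = product.css('img::attr(src)').get()         # src attribute only
        yield {
            'Suggested_Retail': retail.replace('Suggested Retail Price: ', '') if retail else None,
            'In_Stock': stock.replace('Available: ', '') if stock else None,
            # urljoin resolves a relative src against the page URL
            'Image_Link': response.urljoin(img_src) if img_src else None,
        }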
    

  2. Hi, I would begin with seeing what the code below does.

    import csv

    with open(file_path, 'r') as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            # Process each row here
            print(row)
    

    And please black out the login info in the screenshot… and just add the code in a code block.
