
We have a vendor that sells many products that we want to include. They unfortunately do not have an API set up to retrieve information. They do, however, give us product lists that show just the SKU of each item. I want to log in, go to the page that holds the product, scrape specific info from that page, and move on to the next. I want to do this line by line, one at a time, since their website is fragile and prone to going offline (a throttling sketch for this follows the step list below).

Basically, this spider should:

  1. Log in to the website
  2. Load a CSV
  3. Set the current row of the CSV as the target URL
  4. Scrape the data
  5. Load the next URL
  6. Repeat steps 3-5 until the end of the CSV
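
Since the site is fragile, one way to enforce the one-at-a-time behavior is through Scrapy's throttling settings rather than the spider logic itself. A minimal sketch, assuming a standalone spider; the setting names are standard Scrapy settings, but the values are placeholder assumptions to tune for the vendor's site:

from scrapy import Spider

class FragileSiteSpider(Spider):
    name = 'fragile_site'
    # Keep requests strictly sequential so the vendor's server
    # never sees concurrent traffic from this spider.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,      # one request in flight at a time
        'DOWNLOAD_DELAY': 2,           # assumed 2-second pause between requests
        'RETRY_TIMES': 5,              # retry pages when the site drops offline
        'AUTOTHROTTLE_ENABLED': True,  # back off automatically when responses slow down
    }

Here is the spider so far: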

import scrapy
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import csv



class SPIDER_NAME(Spider):
    name = 'RPLogin_Final'
    start_urls = ['WEBSITEURL']

    def parse(self, response):
        token = response.css("form input[name=__RequestVerificationToken]::attr(value)").extract_first()

        return FormRequest(url=self.start_urls[0],
                           formdata={'__RequestVerificationToken': token,
                                     'upw': 'PASSWORD',
                                     'uid': 'USERNAME'},
                           callback=self.scrape_now)

    def scrape_now(self, response):
        print("logged in!")
        # do stuff / go to next page
        with open('partsList.csv', 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            for row in csv_reader:
                print(row)
                yield scrapy.Request(url=row, callback=self.parse_product)

    def parse_product(self, response):
        product = response.css('div.row.jsCartContainer.product-list-item')
        yield {
            'Name': product.css("p.jplist-text-filter::text").get(),
            'Part_Num': product.css("a.jplist-text-filter.jplist-item-num::text").get(),
            'Purchase_Price': product.css("li.jplist-item-price.bold::text").get(),
            'Suggested_Retail': product.css("li:nth-child(2)").get(),
            'In_Stock': product.css("li:nth-child(5)").get(),
            'Image_Link': product.css("img").get()
        }

        open_in_browser(response)
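
As an aside, FormRequest.from_response can build the login request from the page's own form and copies hidden inputs such as __RequestVerificationToken automatically; a minimal sketch of parse in that style, assuming the login form is the first form on the page:

    def parse(self, response):
        # from_response lifts hidden inputs (including the verification
        # token) from the page's form, so only credentials are supplied.
        return FormRequest.from_response(
            response,
            formdata={'upw': 'PASSWORD', 'uid': 'USERNAME'},
            callback=self.scrape_now,
        )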

This spider worked fine when:

url = 'string_URL'

Then I changed this to a CSV file with:

with open('partsList.csv') as file:
    url=[line.strip() for line in file]

and I received this error:

logged in!
['URLS']

  File "/home/partsales/Desktop/Python/parts-env/lib/python3.10/site-packages/scrapy/http/request/__init__.py", line 133, in _set_url
    raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got list
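
For context on what the traceback is pointing at: csv.reader yields each row as a list of column values, so even a one-column file produces ['URL'] rather than 'URL' (and the list comprehension above likewise leaves url holding a whole list of lines). Taking the first column gives Request the string it expects; a minimal sketch against the loop in scrape_now:

for row in csv_reader:
    # row is a list like ['https://example.com/part']; pass column 0
    yield scrapy.Request(url=row[0].strip(), callback=self.parse_product)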

2 Answers


  1. Chosen as BEST ANSWER

    So by using karel van dongen's suggestion, but replacing the CSV file with a TXT file, I was able to get this to execute perfectly!

    Complete (redacted) code below:

    import scrapy
    from scrapy import Spider
    from scrapy import Request
    from scrapy.http import FormRequest
    from scrapy.utils.response import open_in_browser


    class PartsSpider(Spider):
        name = 'productSpider'
        start_urls = ['WEBSITE URL']

        def parse(self, response):
            token = response.css("form input[name=__RequestVerificationToken]::attr(value)").extract_first()

            return FormRequest(url=self.start_urls[0],
                               formdata={'__RequestVerificationToken': token,
                                         #
                                         # Change USER AND PASSWORD
                                         #
                                         'upw': 'ENCODED PASSWORD',
                                         'uid': 'USERNAME'},
                               callback=self.scrape_now)

        def scrape_now(self, response):
            print("logged in!")
            #
            # CHANGE FILE NAME
            #
            with open('partlist.txt') as partslist:
                for line in partslist:
                    curr_url = line.rstrip()
                    print(curr_url)
                    yield Request(url=curr_url, callback=self.parse_product)

        def parse_product(self, response):
            product = response.css('div.row.jsCartContainer.product-list-item')
            yield {
                #
                # CHANGE BRAND
                #
                'Brand': 'BRAND',
                'Name': product.css("p.jplist-text-filter::text").get(),
                'Part_Num': product.css("a.jplist-text-filter.jplist-item-num::text").get().replace(' ', ''),
                'Purchase_Price': product.css("li.jplist-item-price.bold::text").get().replace('Your Price:', '').replace(' / EA', ''),
                'Suggested_Retail': product.css("li:nth-child(2)").get().replace('<li>Suggested Retail Price: ', '').replace('</li>', ''),
                'In_Stock': product.css("li:nth-child(5)").get().replace('<li class="jplist-in-stock">Available: ', '').replace('</li>', ''),
                'Image_Link': product.css("img").get().replace('<img class="product-img" src="', 'https://WEBSITE.com').replace('"', '')[:-16]
            }
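
    A note on the string surgery in parse_product: the same fields can be pulled without slicing raw HTML by asking the selectors for text nodes or attributes directly. A hedged sketch in that style (the container and class names are copied from the code above; the exact prefix strings and the relative image src are assumptions):

    def parse_product(self, response):
        product = response.css('div.row.jsCartContainer.product-list-item')
        retail = product.css('li:nth-child(2)::text').get()   # text only, no <li> tags
        stock = product.css('li:nth-child(5)::text').get()    # text only, no <li> tags
        img_src = product.css('img::attr(src)').get()         # src attribute only
        yield {
            'Suggested_Retail': retail.replace('Suggested Retail Price: ', '') if retail else None,
            'In_Stock': stock.replace('Available: ', '') if stock else None,
            # urljoin resolves a relative src against the page URL
            'Image_Link': response.urljoin(img_src) if img_src else None,
        }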
    

  2. Hi, I would begin with seeing what the code below does.

    import csv

    with open(file_path, 'r') as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            # Process each row here
            print(row)
    

    And please black out the login info in the screenshot… and just add the code in a code block.
