Unable to scrape items using scrapy - Html

WX1505
March 28, 2023
167 views
0 votes
2 Answers

I am trying to webscrape the name, price, and description of products listed on an online shop. The website link is https://eshop.nomin.mn/n-foods.html

When I look through the HTML code of the page, I can see the relevant div class containers, but when I reference them in my code as shown below, I get no values when I run my spider. One possible reason is that the website is JavaScript-based and dynamic, which would require me to use Splash. However, I don’t think this is the case for my issue.

def parse(self, response, **kwargs):
    """Read the text inside each product card's name link.

    Selects every ``div`` whose class is exactly ``item-itemmainroot-1lZ``
    and, for each one, extracts the text nodes of the ``span`` children of
    the matching ``a`` element.
    """
    cards = response.xpath('//div[@class="item-itemmainroot-1lZ"]')

    # parse details
    for card in cards:
        # `span` is an element name test; `span()` is not valid XPath 1.0
        # (only node-type tests such as text() or node() take parentheses).
        # NOTE(review): the anchor classes look like the product *name* link
        # although the result is stored as `price` -- confirm the selector.
        price = card.xpath(".//a[contains(@class, 'item-nameLenght-K5Z item-name-3TH')]/span/text()").extract()

Full Code:

import scrapy
import re


class TempSpider(scrapy.Spider):
    """Spider that reads product cards from the n-foods category page.

    NOTE(review): the selectors rely on generated-looking class names from
    the rendered page; if that markup is built client-side they will match
    nothing in the raw response -- confirm against the downloaded HTML.
    """

    name = 'temp_spider'
    # Scrapy expects bare domain names here; URL entries are not accepted.
    allowed_domains = ['eshop.nomin.mn']
    start_urls = ['https://eshop.nomin.mn/n-foods.html']

    def parse(self, response, **kwargs):
        """Yield one ``{'price': [...]}`` dict per matched product card."""
        cards = response.xpath('//div[@class="item-itemmainroot-1lZ"]')

        # parse details
        for card in cards:
            # `span` is an element name test; `span()` is not valid XPath 1.0
            # (only node-type tests such as text() or node() take parentheses).
            # NOTE(review): the anchor classes look like the product *name*
            # link although the result is stored as `price` -- confirm.
            price = card.xpath(".//a[contains(@class, 'item-nameLenght-K5Z item-name-3TH')]/span/text()").extract()

            yield {'price': price}


  [1]: https://i.stack.imgur.com/iokmo.png

All and any help is greatly appreciated. I can’t seem to figure out what I am doing wrong.

Answers

Use json.

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs

# GraphQL "category" query, already URL-encoded. The encoded variables request
# page 1 with 50 items per page, filtered to category id 24175.
url = 'https://eshop.nomin.mn/graphql?query=query+category%28%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+brand+salable_qty+brand_name+c21_available+c21_business_type+c21_reference+c21_street+c21_area+c21_bed_room+mp_daily_deal%7Bcreated_at+date_from+date_to+deal_id+deal_price+remaining_time+deal_qty+discount_label+is_featured+product_id+product_name+product_sku+sale_qty+status+store_ids+updated_at+__typename%7Dnew_to_date+short_description%7Bhtml+__typename%7DproductAttributes%7Bname+value+__typename%7Dprice%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dspecial_price+special_to_date+thumbnail%7Bfile_small+url+__typename%7Durl_key+url_suffix+mp_label_data%7Benabled+name+priority+label_template+label_image+to_date+__typename%7D...on+ConfigurableProduct%7Bvariants%7Bproduct%7Bsku+special_price+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7D__typename%7D__typename%7D__typename%7D__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=category&variables=%7B%22currentPage%22%3A1%2C%22id%22%3A24175%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22in%22%3A%2224175%22%7D%7D%2C%22pageSize%22%3A50%2C%22sort%22%3A%7B%22position%22%3A%22DESC%22%7D%7D'

# Bound the wait and surface HTTP errors instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
json_data = json.loads(r.text)
# The product records sit under data -> products -> items in the response.
data_docs = json_data['data']['products']['items']

# One row per product, one column per top-level field of each record.
df = pd.DataFrame.from_dict(data_docs)
print(df)

Use the website's data API instead of the page URL that you visit in your browser. It returns a JSON object that has all the information you are looking for.

import scrapy
import re


class TempSpider(scrapy.Spider):
    """Spider that reads product names and prices from the shop's GraphQL endpoint.

    The single start URL carries the whole "category" query; its variables
    request page 1 with 50 items per page for category id 24175.
    """

    name = 'temp_spider'
    # Scrapy expects bare domain names here; URL entries are not accepted.
    allowed_domains = ['eshop.nomin.mn']
    start_urls = ['https://eshop.nomin.mn/graphql?query=query+category($pageSize:Int!$currentPage:Int!$filters:ProductAttributeFilterInput!$sort:ProductAttributeSortInput){products(pageSize:$pageSize+currentPage:$currentPage+filter:$filters+sort:$sort){items{id+name+sku+brand+salable_qty+brand_name+c21_available+c21_business_type+c21_reference+c21_street+c21_area+c21_bed_room+mp_daily_deal{created_at+date_from+date_to+deal_id+deal_price+remaining_time+deal_qty+discount_label+is_featured+product_id+product_name+product_sku+sale_qty+status+store_ids+updated_at+__typename}new_to_date+short_description{html+__typename}productAttributes{name+value+__typename}price{regularPrice{amount{currency+value+__typename}__typename}__typename}special_price+special_to_date+thumbnail{file_small+url+__typename}url_key+url_suffix+mp_label_data{enabled+name+priority+label_template+label_image+to_date+__typename}...on+ConfigurableProduct{variants{product{sku+special_price+price{regularPrice{amount{currency+value+__typename}__typename}__typename}__typename}__typename}__typename}__typename}page_info{total_pages+__typename}total_count+__typename}}&operationName=category&variables={"currentPage":1,"id":24175,"filters":{"category_id":{"in":"24175"}},"pageSize":50,"sort":{"position":"DESC"}}']

    def parse(self, response, **kwargs):
        """Yield one ``{"name": ..., "price": ...}`` dict per product.

        The price is the regular price amount read from
        ``item["price"]["regularPrice"]["amount"]["value"]``.
        """
        data = response.json()
        # Diagnostic only: goes through the spider's logger rather than stdout
        # so it follows the crawl's log level.
        self.logger.debug("Response keys: %s", list(data))
        for item in data['data']["products"]["items"]:
            yield {
                "name": item["name"],
                "price": item["price"]["regularPrice"]["amount"]["value"],
            }

Partial OUTPUT


{'name': 'Хиам Аялал кг', 'price': 19559}
{'name': 'Чихэр Княжеские 1кг', 'price': 24859}
{'name': 'Жимсний чанамал Mr', 'price': 11999}
{'name': 'Vit C ', 'price': 28799}
{'name': 'Жүүс Моя семья', 'price': 3629}
{'name': 'Муурны ялгадас шингээх', 'price': 31999}
{'name': 'Компот Vidan 920гр', 'price': 8879}
{'name': 'Мөс 0.5кг 024218', 'price': 2029}
{'name': 'Өргөст хэмх Hainich', 'price': 7799}
{'name': 'Соус чилитэй 215гр', 'price': 9499}
{'name': 'Цай Ottogi улаан', 'price': 14299}
{'name': 'Цай шингэн Pfanner', 'price': 9379}
{'name': '02381088', 'price': 3179}
{'name': 'Өглөөний хоол G&G', 'price': 8239}
{'name': '02S003167', 'price': 7699}
{'name': '02S003133', 'price': 8299}
{'name': 'Кофе Жокей империал', 'price': 14279}
{'name': 'Жүүс Pfanner orange', 'price': 13129}
{'name': 'Цуу улаан дарсны', 'price': 6939}
{'name': 'Оливын тос Borges', 'price': 14749}
{'name': 'Оливын тос classic', 'price': 33629}
{'name': 'Оливын тос Borges', 'price': 18629}
{'name': 'Гоймон Borges Fusilli', 'price': 5939}
{'name': 'Цай шингэн чавганы', 'price': 2469}
{'name': 'Гоймон Нүүдэл 500гр', 'price': 3759}
{'name': 'Муурны хоол 85гр', 'price': 1889}
{'name': 'Бэлэн Карри зөөлөн', 'price': 7499}
{'name': 'Цай Dr.Baatar 2гр*16ш', 'price': 11999}
{'name': 'Нухаш Urbanek ', 'price': 6979}
{'name': 'Вандуй лууван холимог', 'price': 5899}
{'name': 'Өргөст хэмх Bagro', 'price': 13499}
{'name': 'Бэлэн хоол Samyang', 'price': 6189}
{'name': 'Жүүс Naturalis apple', 'price': 1589}
{'name': 'Жүүс Naturalis Apple-grape', 'price': 5999}
{'name': 'Жүүс Naturalis Apple-sour', 'price': 5999}
{'name': 'Жүүс Vita Pomegranate', 'price': 3659}
{'name': 'Шоколад Luna 33гр', 'price': 1499}
{'name': 'Жүүс Фруктовый Сад', 'price': 5999}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}
{'name': 'Жүүс Фруктовый Сад', 'price': 5299}

You can find the URL for the API in the Network tab of your browser's devtools.

Please signup or login to give your own answer.

Click here to cancel reply.

Unable to scrape items using scrapy – Html

Answers