I have created a scraping tool to crawl data from the AliExpress website using Python and Selenium.
This is my script in python :
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import time
def scrap(subject, collection_name='aliex2'):
    """Scrape AliExpress search results for *subject* into MongoDB.

    Crawls the first four result pages for the given search term, extracts
    one row per product (title, price, currency, stars, orders, shipping
    cost, supplier, product link) and bulk-inserts all rows into
    ``db2.<collection_name>`` on the local MongoDB instance.

    Parameters
    ----------
    subject : str
        Search term entered by the user.
    collection_name : str, optional
        Target collection. Defaults to ``'aliex2'`` (backward compatible);
        pass a unique name (e.g. built from the subject or a timestamp) to
        get a fresh collection per run instead of appending to the same one.
    """
    start_time = time.time()
    options = Options()
    options.headless = True  # run Edge without opening a visible window
    # NOTE(review): the driver path lost its backslashes in the paste;
    # reconstructed as a raw string -- confirm it matches the real location.
    driver = webdriver.Edge(
        executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",
        options=options,
    )
    # '&ltype=' was garbled to '<ype=' in the original URL, which broke the
    # query string ('&lt' is the HTML entity for '<').
    url = ('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0'
           '&SearchText=' + subject + '&ltype=wholesale&SortType=default&page={}')
    results = []  # rows from ALL pages; written to the DB once, after the loop
    try:
        for page_nb in range(1, 5):
            print('---', page_nb, '---')
            driver.get(url.format(page_nb))
            sleep(2)
            # Scroll until the page offset stops growing so that
            # lazily-loaded products are rendered into the DOM.
            current_offset = 0
            while True:
                driver.execute_script("window.scrollBy(0, window.innerHeight);")
                sleep(.5)  # give JavaScript time to add elements
                new_offset = driver.execute_script("return window.pageYOffset;")
                if new_offset <= current_offset:
                    break
                current_offset = new_offset
            sleep(3)
            tree = html.fromstring(driver.page_source)
            for product in tree.xpath('//div[@class="JIIxO"]//a'):
                results.append(_extract_row(product))
    finally:
        driver.quit()  # always release the browser, even if scraping fails

    df = pd.DataFrame(
        results,
        columns=("Title", "Price", "Currency", "Stars", "Orders",
                 "Shipcost", "Supplier", "Productlinks"),
    )
    data = df.to_dict(orient='records')
    client = MongoClient("mongodb://localhost:27017/")
    try:
        if data:  # insert_many raises InvalidOperation on an empty list
            client['db2'][collection_name].insert_many(data)
    finally:
        client.close()
    print("--- %s seconds ---" % (time.time() - start_time))
    return


def _first_or_none(node, xpath_expr):
    """Return the first text match of *xpath_expr* under *node*, or 'None'."""
    values = node.xpath(xpath_expr)
    return values[0] if values else 'None'


def _extract_row(product):
    """Extract one result row from a product ``<a>`` element.

    Returns a list: [title, price, currency, stars, nb_sold, ship_cost,
    supplier, product_link].
    """
    title = product.xpath('.//h1/text()')
    title = title[0] if title else title
    price_parts = [x.text for x in product.cssselect('div.mGXnE._37W_B span')]
    # First span is the currency symbol; the remaining spans are the price
    # digits. Guard against an empty list (previously an IndexError).
    currency = price_parts[0] if price_parts else 'None'
    price = ''.join(price_parts[1:])
    stars = _first_or_none(product, './/span[@class="eXPaM"]/text()')
    nb_sold = _first_or_none(product, './/span[@class="_1kNf9"]/text()')
    supl = _first_or_none(product, './/a[@class="ox0KZ"]/text()')
    ship_cost = _first_or_none(product, './/span[@class="_2jcMA"]/text()')
    product_links = product.xpath('./@href')
    if product_links:
        product_links = str(product_links[0])
    return [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
Explanation :
My code gets all the data from the AliExpress website and stores it in a MongoDB database collection, to be displayed in a datatable using Django.
The issue I am struggling with is that when I execute my code the first time, everything goes right. But when I execute it a second time, the data goes into the same collection instead of a new collection and a new datatable.
In other words, I want a new collection and a new datatable for each execution of the code.
My datatable view :
def datatable_view(request):
    """Render the scraped AliExpress products as a datatable.

    On a POST with a valid ``Scraping`` form, run the scraper for the
    submitted subject first; in every case the view then reads the
    ``db2.aliex2`` collection and renders ``datatable.html`` with the
    products, so a plain GET still returns a page instead of failing.
    """
    if request.method == 'POST':
        form = Scraping(request.POST)
        if form.is_valid():
            subject = form.cleaned_data['subject']
            scrap(subject)
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    try:
        col = client["db2"]["aliex2"]
        # Materialize the cursor before closing the client; a lazy cursor
        # consumed by the template after close() would fail.
        products = list(col.find())
    finally:
        client.close()
    context = {'products': products}
    return render(request, 'datatable.html', context)
My datatable html :
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Aliexpress Datatable </title>
<meta name="description" content="Aliexpress DataTable.">
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.0.0/css/bootstrap.css">
<link rel="stylesheet" href="//cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
</head>
<body >
<div class="container">
<div class="row">
<div class="col-sm-12">
<table cellspacing="5" cellpadding="5">
<tbody>
<tr>
<td>Minimum star:</td>
<td><input type="text" id="min1" name="min1"></td>
</tr>
<tr>
<td>Maximum star:</td>
<td><input type="text" id="max1" name="max1"></td>
</tr>
<tr>
<td>Minimum price:</td>
<td><input type="text" id="min2" name="min2"></td>
</tr>
<tr>
<td>Maximum price:</td>
<td><input type="text" id="max2" name="max2"></td>
</tr>
<tr>
<td> Minimum number of orders :</td>
<td><input type="text" id="min3" name="min3"></td>
</tr>
<tr>
<td> Maximum number of orders:</td>
<td><input type="text" id="max3" name="max3"></td>
</tr>
</tbody>
</table>
<table id="scrap" class="table table-striped table-bordered" style="width:100%">
<thead>
<tr>
<th >Title</th>
<th >Price</th>
<th >Currency</th>
<th >Stars</th>
<th >Number Of Orders</th>
<th >Shipping Cost</th>
<th >Supplier</th>
<th >Product Links</th>
</tr>
</thead>
<tbody>
{% for product in products %}
<tr>
<td> {{ product.Title }} </td>
<td> {{ product.Price }} </td>
<td> {{ product.Currency }} </td>
<td> {{ product.Stars }} </td>
<td> {{ product.Orders}} </td>
<td> {{ product.Shipcost }} </td>
<td> {{ product.Supplier }} </td>
<td><a href="{{product.Productlinks }}"> Click here</a></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
<!-- jQuery must be loaded exactly once, BEFORE DataTables; the previous
     second jQuery (3.1.0) include overwrote the first and has been removed. -->
<script src="https://code.jquery.com/jquery-3.5.1.js"></script>
<script type="text/javascript" src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
<script>
// Factory for a DataTables range filter: keeps a row when the numeric value
// in column `colIdx` lies within the [#minId, #maxId] inputs. An empty input
// means "no bound on that side" (same semantics as the three original
// copy-pasted filter functions).
function rangeFilter(minId, maxId, colIdx) {
    return function (settings, data, dataIndex) {
        var min = parseFloat($(minId).val());   // parseFloat takes one argument
        var max = parseFloat($(maxId).val());
        var value = parseFloat(data[colIdx]) || 0;
        return (isNaN(min) && isNaN(max)) ||
               (isNaN(min) && value <= max) ||
               (min <= value && isNaN(max)) ||
               (min <= value && value <= max);
    };
}
$.fn.dataTable.ext.search.push(
    rangeFilter('#min1', '#max1', 3),   // stars column
    rangeFilter('#min2', '#max2', 1),   // price column
    rangeFilter('#min3', '#max3', 4)    // number-of-orders column
);
$(document).ready(function () {
    // Initialize the DataTable once (the original initialized it three times).
    var table = $('#scrap').DataTable();
    // Redraw on input so the range filters above are re-applied.
    $('#min1, #max1, #min2, #max2, #min3, #max3').keyup(function () {
        table.draw();
    });
});
</script>
</body>
</html>
P.S.: subject is the name of the product that the user enters before scraping.
I would be so grateful if you help me.
Thank you in advance !
3
Answers
You are always saving the data in the collection called
aliex2
which is the reason it is being stored in the same collection. If you want to store the results in a new collection every time you scrape new data, make sure you use a unique collection name instead of aliex2.
MongoDB creates new databases and collections once you try to reference them. So by changing the referenced db and/or collection at the end of your scraping run, you could write to a new db/collection.
To create a new collection for every subject, you could change the database reference in your scrap method to something like:
To use the variable reference in the datatable_view, you would have to use the subject from the POST request. Using the following snippet might break your request, but allows you to read the crawled subject dynamically:
Looking at your problem, I got an idea that may help you:
I moved the four database lines out of the page_nb loop, so the rows from all pages are appended to the results list, and at the end that list is written to the DataFrame and to the database in one step.
datatable view and datatable HTML will Remain same
Try this simple idea to solve your problem.