Vincent's space on the web
How do you determine a fair price when selling or buying a second-hand item?
I wrote a script that collects data from every listing on Kijiji Quebec and saves it to a SQLite database. The data can then be analyzed by category, price range and other attributes, using Excel or Python.
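The script expects an ads table to already exist in kijiji.db. The actual CREATE TABLE statement isn't shown here, but a minimal schema sketch matching the INSERT further down would look like this (the UNIQUE constraint on ad_code is what lets INSERT OR IGNORE skip duplicates; the real table may have extra columns or constraints):

import sqlite3

# assumed schema sketch; one column per field scraped below
db = sqlite3.connect('kijiji.db')
db.execute("""CREATE TABLE IF NOT EXISTS ads(
    title TEXT, price REAL, description TEXT, address TEXT,
    link TEXT, ad_code INTEGER UNIQUE, publish_datetime TEXT,
    city TEXT, latitude REAL, longitude REAL,
    ad_category TEXT, ad_subcategory TEXT,
    visit_count INTEGER, photo_count INTEGER)""")
db.commit()
db.close()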
Here is an overview of the code:
import requests
import sqlite3
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

# retry transient connection errors with a backoff instead of failing the whole run
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
# Scrape Kijiji for new ads and collect ad data in the database
def get_new_ads():
    start_time = datetime.now()
    print("get_new_ads start, it is currently:", start_time)
    db = sqlite3.connect('kijiji.db')
    cursor = db.cursor()
    # fetch the existing ad_codes to check whether an ad has already been scraped
    cursor.execute("""select ad_code from ads""")
    existing_ad_codes = [i[0] for i in cursor.fetchall()]
    # match_count tracks successive already-scraped ads; too many in a row means
    # the script has caught up with the previous run and can stop early
    match_count = 0
    new_ads = []
    url1 = "https://www.kijiji.ca/b-quebec/page-"
    url2 = "/l9001"
    for x in range(1, 100):  # Kijiji serves at most ~100 result pages
        if match_count >= 100:
            # 100 successive known ads: everything new has been scraped
            print("match_count >= 100")
            break
        url_page = url1 + str(x) + url2
        print("page: " + str(x))
        r = session.get(url_page)
        soup = BeautifulSoup(r.content, "html.parser")
        ad_list_on_page = soup.find_all('div', {"class": "regular-ad"})
        # scraping data from the ad thumbnails on the search result page
        for item in ad_list_on_page:
            # skip third-party sponsored ads
            if "third-party" in item.attrs["class"]:
                continue
            try:
                link = item.find("a", {"class": "title"})
                link = "https://www.kijiji.ca" + link["href"]
            except Exception:
                continue
            title = item.find("div", {"class": "title"})
            title = title.contents[1].text.strip()
            location = item.find("div", {"class": "location"})
            city = location.contents[0].strip()
            # scraping data from the individual ad page
            r = session.get(link)
            soup = BeautifulSoup(r.content, "html.parser")
            # ad_code
            try:
                ad_code = int(soup.select('a[class*="adId"]')[0].text)
                # skip ads that are already in the DB
                if ad_code in existing_ad_codes:
                    print("ad_code in existing_ad_codes, continue... timestamp: {}".format(datetime.now().strftime("%X")))
                    # many of these in a row means the previous run already covered the rest
                    match_count += 1
                    continue
                else:
                    match_count = 0
            except Exception:
                # could not retrieve the ad_code, skip to the next ad
                continue
            # publication datetime
            try:
                publish_datetime = soup.find("time")['datetime']
            except Exception:
                publish_datetime = None
            # price: prefer the structured itemprop value, fall back to the displayed text
            try:
                price = float(soup.find("span", {"itemprop": "price"})['content'])
            except Exception:
                try:
                    price_text = (soup.select('span[class*="currentPrice"]')[0].text
                                  .replace(' ', '').replace('$', '').replace(',', '.'))
                    if price_text in ('Gratuit', 'Échange'):
                        # free and swap ads are stored as a price of 0
                        price = 0.0
                    elif price_text in ('Surdemande', ''):
                        # "Sur demande" (price on request) carries no usable value
                        price = None
                    else:
                        price = float(price_text)
                except Exception:
                    price = None
            # description
            try:
                description = soup.find("div", {"itemprop": "description"}).text.replace("\n", " ")
            except Exception:
                description = None
            # address
            try:
                address = soup.select('span[class*="address"]')[0].text
            except Exception:
                address = None
            # latitude / longitude
            try:
                latitude = float(soup.find("meta", {"property": "og:latitude"})['content'])
            except Exception:
                latitude = None
            try:
                longitude = float(soup.find("meta", {"property": "og:longitude"})['content'])
            except Exception:
                longitude = None
            # ad_category / ad_subcategory, from the breadcrumb trail
            # breadcrumb rules (indices into crumb_elements):
            # - exactly 4 crumbs: category is crumbs[2], subcategory crumbs[3];
            #   under 'Immobilier' it is crumbs[1]/crumbs[2], and under
            #   'Acheter et vendre' the category shifts to crumbs[3]
            #   (crumbs[4] is then out of range, so the subcategory stays None)
            # - more than 4 crumbs: category is crumbs[3], subcategory crumbs[4],
            #   shifted to crumbs[4]/crumbs[5] under 'Acheter et vendre'
            try:
                crumb_elements = soup.select('li[class*="crumbItem"]')
                if len(crumb_elements) == 4:
                    if crumb_elements[2].text == 'Acheter et vendre':
                        try:
                            ad_category = crumb_elements[3].text
                        except Exception:
                            ad_category = None
                        try:
                            ad_subcategory = crumb_elements[4].text
                        except Exception:
                            ad_subcategory = None
                    elif crumb_elements[1].text == 'Immobilier':
                        ad_category = crumb_elements[1].text
                        ad_subcategory = crumb_elements[2].text
                    else:
                        ad_category = crumb_elements[2].text
                        ad_subcategory = crumb_elements[3].text
                else:
                    if crumb_elements[3].text == 'Acheter et vendre':
                        try:
                            ad_category = crumb_elements[4].text
                        except Exception:
                            ad_category = None
                        try:
                            ad_subcategory = crumb_elements[5].text
                        except Exception:
                            ad_subcategory = None
                    else:
                        try:
                            ad_category = crumb_elements[3].text
                        except Exception:
                            ad_category = None
                        try:
                            ad_subcategory = crumb_elements[4].text
                        except Exception:
                            ad_subcategory = None
                # crumb text can end with 'à <location>'; keep only the category name
                if ad_category and 'à' in ad_category:
                    ad_category = ad_category[:ad_category.rfind('à')].strip()
                if ad_subcategory and 'à' in ad_subcategory:
                    ad_subcategory = ad_subcategory[:ad_subcategory.rfind('à')].strip()
            except Exception:
                ad_category = None
                ad_subcategory = None
            # visit count
            try:
                visit_count = int(soup.select('div[class*="visitCounter"]')[0].text.split()[0])
            except Exception:
                visit_count = None
            # photo_count: one for the hero image, plus one per visible thumbnail
            main_photo = soup.select('div[class*="heroImageContainer"]')
            photo_count = 1 if main_photo else 0
            t1 = soup.select('div[class*="thumbnailOne"]')
            t2 = soup.select('div[class*="thumbnailTwo"]')
            t3 = soup.select('div[class*="thumbnailThree"]')
            t3e = soup.select('div[class*="noBorderThumbnail"]')
            for thumb in (t1, t2, t3):
                if thumb:
                    photo_count += 1
            if t3e:
                photo_count -= 1
            # the last thumbnail can be a "+N" overlay hiding the remaining photos
            more_images = soup.select('div[class*="moreImages"]')
            for images in more_images:
                extra = int(images.contents[0].replace('+', ''))
                photo_count = photo_count + extra - 1
            # collect the scraped fields into a tuple, ready for insertion into the database
            insert_values = (title, price, description, address, link, ad_code,
                             publish_datetime, city, latitude, longitude,
                             ad_category, ad_subcategory, visit_count, photo_count)
            new_ads.append(insert_values)
            existing_ad_codes.append(ad_code)
    cursor.executemany("""INSERT OR IGNORE INTO ads(
        title, price, description, address, link, ad_code, publish_datetime, city,
        latitude, longitude, ad_category, ad_subcategory, visit_count, photo_count)
        VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", new_ads)
    db.commit()
    db.close()
    print("get_new_ads done... inserted {} new ads into the DB".format(len(new_ads)))
    duration = datetime.now() - start_time
    print("It took:", duration)

if __name__ == "__main__":
    get_new_ads()
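Once the database is populated, the category and price analysis mentioned above needs nothing beyond the standard library. A minimal sketch, assuming the schema shown earlier: median asking price per category and subcategory, ignoring unpriced and free ads (pandas or Excel work just as well on the same kijiji.db file):

import sqlite3
import statistics

# pull every priced ad with its category labels
db = sqlite3.connect('kijiji.db')
rows = db.execute("""SELECT ad_category, ad_subcategory, price FROM ads
                     WHERE price IS NOT NULL AND price > 0""").fetchall()
db.close()

# group prices by (category, subcategory)
prices = {}
for category, subcategory, price in rows:
    prices.setdefault((category, subcategory), []).append(price)

# only report groups with enough ads for the median to mean something
for (category, subcategory), values in sorted(prices.items(), key=lambda kv: (kv[0][0] or '', kv[0][1] or '')):
    if len(values) >= 30:
        print("{} / {}: median {:.2f} $ over {} ads".format(
            category, subcategory, statistics.median(values), len(values)))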