How to price items when selling them second hand?
I wrote a script that collects all the ad data from Kijiji Quebec and stores it in a SQLite database. The data can then be analysed per category, price range, and other attributes using Excel or Python.
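The script assumes an ads table already exists in kijiji.db. As a sketch of what that table could look like (the column names come from the script's INSERT statement; the types and the UNIQUE constraint on ad_code are my assumptions, the latter being what lets INSERT OR IGNORE skip duplicates):

import sqlite3

db = sqlite3.connect('kijiji.db')
db.execute("""CREATE TABLE IF NOT EXISTS ads(
    title TEXT, price REAL, description TEXT, address TEXT, link TEXT,
    ad_code INTEGER UNIQUE, publish_datetime TEXT, city TEXT,
    latitude REAL, longitude REAL, ad_category TEXT, ad_subcategory TEXT,
    visit_count INTEGER, photo_count INTEGER)""")
db.commit()
db.close()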
Here's an overview of the code:
import requests
import sqlite3
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
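# third-party dependencies: pip install requests beautifulsoup4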
# general-purpose setup that retries transient connection errors when requesting web pages
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
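# optional: sending a browser-like User-Agent (the value here is just an example)
# makes it less likely that requests get blocked or throttled
session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; kijiji-price-scraper)'})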
# Scrape Kijiji for new ads and collect ad data in the database
def get_new_ads():
start_time = datetime.now()
print("get_new_ads start, it is currently:",start_time)
db = sqlite3.connect('kijiji.db')
cursor = db.cursor()
# get the existing ad_codes to verify if an ad has already been scraped
cursor.execute("""select ad_code from ads""")
existing_ad_codes = cursor.fetchall()
    existing_ad_codes = {row[0] for row in existing_ad_codes}  # a set makes the membership check below fast
    # match_count tracks successive already-scraped ads; too many in a row means we can stop
match_count = 0
new_ads = []
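    # result pages follow the pattern https://www.kijiji.ca/b-quebec/page-<n>/l9001,
    # where l9001 appears to be Kijiji's location code for Quebec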
url1 = "https://www.kijiji.ca/b-quebec/page-"
url2 = "/l9001"
    for x in range(1, 101):  # Kijiji serves at most 100 result pages
        # stop before requesting another page once we hit a long run of known ads
        if match_count >= 100:
            print("match_count >= 100")
            break
        url_page = url1 + str(x) + url2
        print("page: " + str(x))
        r = session.get(url_page)
        soup = BeautifulSoup(r.content, "html.parser")
        ad_list_on_page = soup.find_all('div', {"class": "regular-ad"})
# scraping data from ad_thumbnail on search result page
for item in ad_list_on_page:
# filtering out third party sponsored ads
if "third-party" not in item.attrs["class"]:
try:
link = item.find("a", {"class": "title"})
link = "https://www.kijiji.ca" + link["href"]
# print("link: " + link + " type: " + str(type(link)))
except:
continue
title = item.find("div", {"class": "title"})
title = title.contents[1].text.strip()
# print("title: " + title + " type: " + str(type(title)))
location = item.find("div", {"class": "location"})
city = location.contents[0].strip()
# print("city: " + city + " type: " + str(type(city)))
                # scraping data from the individual ad page
                r = session.get(link)
soup = BeautifulSoup(r.content, "html.parser")
# ad_code
try:
ad_code = int(soup.select('a[class*="adId"]')[0].text)
# print("ad_code: " + str(ad_code) + " type: " + str(type(ad_code)))
# checking if ad_code in DB
if ad_code in existing_ad_codes:
print("ad_code in existing_ad_codes, continue... timestamp: {}".format(datetime.now().strftime("%X")))
                        # a long run of these means we've reached ads scraped on a previous pass
match_count += 1
# print("match count: " + str(match_count))
continue
else:
# print("No match, pass and keep scraping the ad...")
match_count = 0
pass
except:
# print("could not retrieve ad_code, continue to next ad")
continue
# publication datetime
try:
publish_datetime = soup.find("time")['datetime']
# print("publish_datetime: " + publish_datetime + " type: " + str(type(publish_datetime)))
except:
# print("could not retrieve publish_datetime, set to null")
publish_datetime = None
                # price: prefer the machine-readable itemprop, fall back to the displayed text
                try:
                    price = float(soup.find("span", {"itemprop": "price"})['content'])
                except:
                    try:
                        price_text = soup.select('span[class*="currentPrice"]')[0].text.strip()
                        if price_text in ("Gratuit", "Échange"):
                            # free items and trades are stored as 0
                            price = 0.0
                        elif price_text in ("Sur demande", ""):
                            # "price on request" carries no usable number
                            price = None
                        else:
                            # strip spaces (incl. non-breaking) and the currency sign before converting
                            price = float(price_text.replace('\xa0', '').replace(' ', '')
                                          .replace('$', '').replace(',', '.'))
                    except:
                        # could not parse a price, set to null
                        price = None
                # print("price: " + str(price) + " type: " + str(type(price)))
# description
try:
description = soup.find("div",{"itemprop": "description"}).text.replace("\n", " ")
# print("description: " + str(description) + " type: " + str(type(description)))
except:
# print("could not retrieve description, set to null")
description = None
# address
try:
address = soup.select('span[class*="address"]')[0].text
# print("address: " + str(address) + " type: " + str(type(address)))
except:
# print("could not retrieve address, set to null")
address = None
# latitude
try:
latitude = float(soup.find("meta", {"property":"og:latitude"})['content'])
# print("latitude: " + str(latitude) + " type: " + str(type(latitude)))
except:
# print("could not retrieve latitude, set to null")
latitude = None
# longitude
try:
longitude = float(soup.find("meta", {"property":"og:longitude"})['content'])
# print("longitude: " + str(longitude) + " type: " + str(type(longitude)))
except:
# print("could not retrieve longitude, set to null")
longitude = None
# ad_category
                # breadcrumb rules (0-based indices into crumb_elements):
                # with 4 crumbs, the category is crumb 2 and the subcategory crumb 3
                # with more than 4, the category is crumb 3 and the subcategory crumb 4,
                # except when crumb 3 is 'Acheter et vendre': then category is crumb 4 and subcategory crumb 5
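                # hypothetical example: Accueil > Acheter et vendre > Meubles > Lits, matelas à Ville de Québec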
try:
crumb_elements = soup.select('li[class*="crumbItem"]')
# for crumb in crumb_elements:
# print(crumb.text)
if len(crumb_elements) == 4:
if crumb_elements[2].text == 'Acheter et vendre':
try:
ad_category = crumb_elements[3].text
except:
ad_category = None
try:
ad_subcategory = crumb_elements[4].text
except:
ad_subcategory = None
elif crumb_elements[1].text == 'Immobilier': # immobilier
ad_category = crumb_elements[1].text
ad_subcategory = crumb_elements[2].text
else:
ad_category = crumb_elements[2].text
ad_subcategory = crumb_elements[3].text
else:
if crumb_elements[3].text == 'Acheter et vendre':
try:
ad_category = crumb_elements[4].text
except:
ad_category = None
try:
ad_subcategory = crumb_elements[5].text
except:
ad_subcategory = None
else:
try:
ad_category = crumb_elements[3].text
except:
ad_category = None
try:
ad_subcategory = crumb_elements[4].text
except:
ad_subcategory = None
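                    # breadcrumb text often ends with "à <location>"; strip that suffix to keep only the category name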
try:
if 'à' in ad_category:
ad_category = ad_category[:ad_category.rfind('à')].strip()
except:
pass
try:
if 'à' in ad_subcategory:
ad_subcategory = ad_subcategory[:ad_subcategory.rfind('à')].strip()
except:
pass
# print("ad_category: " + str(ad_category) + " type: " + str(type(ad_category)))
# print("ad_subcategory: " + str(ad_subcategory) + " type: " + str(type(ad_subcategory)))
except:
# print("could not retrieve ad_category, set to null")
ad_category = None
ad_subcategory = None
# visit count
try:
visit_count = int(soup.select('div[class*="visitCounter"]')[0].text.split()[0])
# print("visit_count: " + str(visit_count) + " type: " + str(type(visit_count)))
except:
# print("could not retrieve preformat_visit_count, set to null")
visit_count = None
                # photo_count: the ad page shows a hero image, up to three thumbnails,
                # and sometimes a '+N' tile counting the remaining photos
                main_photo = soup.select('div[class*="heroImageContainer"]')
                photo_count = 1 if main_photo else 0
                t1 = soup.select('div[class*="thumbnailOne"]')
                t2 = soup.select('div[class*="thumbnailTwo"]')
                t3 = soup.select('div[class*="thumbnailThree"]')
                t3e = soup.select('div[class*="noBorderThumbnail"]')
                for thumb in (t1, t2, t3):
                    if thumb:
                        photo_count += 1
                if t3e:
                    # adjust for the borderless thumbnail variant
                    photo_count -= 1
                # the last thumbnail can be a '+N' overlay for the photos not shown
                more_images = soup.select('div[class*="moreImages"]')
                for images in more_images:
                    extra_photos = int(images.contents[0].replace('+', ''))
                    photo_count = photo_count + extra_photos - 1
# making a tuple from the scraped data, in preparation to inserting in database
                insert_values = (title, price, description, address, link, ad_code, publish_datetime, city,
                                 latitude, longitude, ad_category, ad_subcategory, visit_count, photo_count)
# print(insert_values)
new_ads.append(insert_values)
                existing_ad_codes.add(ad_code)
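    # INSERT OR IGNORE skips rows that violate a uniqueness constraint (e.g. on ad_code),
    # so re-running the script does not create duplicates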
cursor.executemany("""INSERT OR IGNORE INTO ads(title,price,description,address,link,ad_code,publish_datetime,city,latitude,\
longitude,ad_category,ad_subcategory,visit_count,photo_count) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", new_ads)
db.commit()
db.close()
print("get_new_ads done... inserting {} new ads in DB".format(len(new_ads)))
duration = datetime.now() - start_time
print("It took:",duration)
if __name__=="__main__":
get_new_ads()
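
With a few runs' worth of data collected, pricing research becomes a query. Here is a minimal sketch, assuming the schema above (the category name is just a placeholder): it prints the ad count and average asking price per subcategory, skipping free items and ads without a usable price.

import sqlite3

db = sqlite3.connect('kijiji.db')
cursor = db.cursor()
cursor.execute("""SELECT ad_subcategory, COUNT(*), ROUND(AVG(price), 2)
                  FROM ads
                  WHERE ad_category = ? AND price > 0
                  GROUP BY ad_subcategory
                  ORDER BY COUNT(*) DESC""", ('Meubles',))
for subcategory, ad_count, avg_price in cursor.fetchall():
    print(subcategory, ad_count, avg_price)
db.close()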