Scraping classified ad data using Python


How do you price items when selling them second-hand?

I wrote a script that collects data from every ad on Kijiji Quebec and stores it in a SQLite database. The data can then be analysed by category, price range, and other attributes using Excel or Python.
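
The script assumes that kijiji.db already contains an ads table. The post doesn't show the schema, but a minimal sketch compatible with the INSERT statement below could look like this (the column types are my assumptions, inferred from how each field is parsed):

import sqlite3

db = sqlite3.connect('kijiji.db')
db.execute("""CREATE TABLE IF NOT EXISTS ads(
                  title TEXT,
                  price REAL,
                  description TEXT,
                  address TEXT,
                  link TEXT,
                  ad_code INTEGER UNIQUE,  -- UNIQUE is what lets INSERT OR IGNORE skip duplicates
                  publish_datetime TEXT,
                  city TEXT,
                  latitude REAL,
                  longitude REAL,
                  ad_category TEXT,
                  ad_subcategory TEXT,
                  visit_count INTEGER,
                  photo_count INTEGER)""")
db.commit()
db.close()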

Here's an overview of the code:

import requests
import sqlite3
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

# retry transient connection errors (3 attempts with exponential backoff)
# so a flaky request doesn't kill the whole scrape
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
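# mounting the adapter on both prefixes applies the retry policy to every request made through session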

# Scrape Kijiji for new ads and collect ad data in the database
def get_new_ads():
    start_time = datetime.now()
    print("get_new_ads start, it is currently:",start_time)
    db = sqlite3.connect('kijiji.db')
    cursor = db.cursor()
    
    # get the existing ad_codes to verify if an ad has already been scraped
    cursor.execute("""select ad_code from ads""")
    existing_ad_codes = cursor.fetchall()    
    existing_ad_codes = {i[0] for i in existing_ad_codes}  # a set makes the duplicate check O(1)
    
    # count consecutive already-scraped ads; a long run means we've caught up to old listings and can stop
    match_count = 0
    new_ads = []
    url1 = "https://www.kijiji.ca/b-quebec/page-"
    url2 = "/l9001"
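    # pages look like "https://www.kijiji.ca/b-quebec/page-N/l9001";
    # l9001 is the Kijiji location code for the province of Quebec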
    
    for x in range(1, 101):  # Kijiji serves at most 100 pages of results
        # stop before fetching another page once we've hit a long run of duplicates
        if match_count >= 100:
            print("match_count >= 100, stopping")
            break
        url_page = url1 + str(x) + url2
        print("page: " + str(x))
        r = session.get(url_page)
        soup = BeautifulSoup(r.content, "html.parser")
        ad_list_on_page = soup.find_all('div', {"class": "regular-ad"})
    
        # scraping data from ad_thumbnail on search result page
        for item in ad_list_on_page:
            # filtering out third party sponsored ads
            if "third-party" not in item.attrs["class"]:
                try:
                    link = item.find("a", {"class": "title"})
                    link = "https://www.kijiji.ca" + link["href"]
#                    print("link: " + link + " type: " + str(type(link)))
                except (TypeError, KeyError):
                    continue
    
                title = item.find("div", {"class": "title"})
                title = title.contents[1].text.strip()
#                print("title: " + title + " type: " + str(type(title)))
                
                location = item.find("div", {"class": "location"})
                city = location.contents[0].strip()
#                print("city: " + city + " type: " + str(type(city)))
                
                # scraping data from the individual ad page
                url_page = link
                r = session.get(url_page)
                soup = BeautifulSoup(r.content, "html.parser")
    
                # ad_code
                try:
                    ad_code = int(soup.select('a[class*="adId"]')[0].text)
#                    print("ad_code: " + str(ad_code) + " type: " + str(type(ad_code)))
                    
                    # checking if ad_code in DB
                    if ad_code in existing_ad_codes:
                        print("ad_code in existing_ad_codes, continue... timestamp: {}".format(datetime.now().strftime("%X")))
                        # a long run of these means we've reached ads scraped on a previous run
                        match_count += 1
#                        print("match count: " + str(match_count))
                        continue
                    else:
#                        print("No match, keep scraping the ad...")
                        match_count = 0
                    
                except (IndexError, ValueError):
#                    print("could not retrieve ad_code, continue to next ad")
                    continue
                
                # publication datetime
                try:                    
                    publish_datetime = soup.find("time")['datetime']
#                    print("publish_datetime: " + publish_datetime + " type: " + str(type(publish_datetime)))
                except (TypeError, KeyError):
#                    print("could not retrieve publish_datetime, set to null")
                    publish_datetime = None
    
                # price
                try:
                    price = float(soup.find("span", {"itemprop": "price"})['content'])
                except (TypeError, KeyError, ValueError):
                    try:
                        price = (soup.select('span[class*="currentPrice"]')[0].text
                                 .replace(' ', '').replace('$', '').replace(',', '.'))
                        if price in ('Gratuit', 'Échange'):  # "free" / "swap" listings
                            price = 0.0
                        elif price in ('Surdemande', ''):  # "on request" or missing
                            price = None
                        else:
                            price = float(price)
                    except (IndexError, ValueError):
#                        print("could not retrieve price, set to null")
                        price = None
#                print("price: " + str(price) + " type: " + str(type(price)))
                        
                # description
                try:
                    description = soup.find("div",{"itemprop": "description"}).text.replace("\n", " ")
#                    print("description: " + str(description) + " type: " + str(type(description)))
                
                except AttributeError:
#                    print("could not retrieve description, set to null")
                    description = None
                    
                # address
                try:
                    address = soup.select('span[class*="address"]')[0].text
#                    print("address: " + str(address) + " type: " + str(type(address)))
                except IndexError:
#                    print("could not retrieve address, set to null")
                    address = None
                    
                # latitude
                try:
                    latitude = float(soup.find("meta", {"property":"og:latitude"})['content'])
#                    print("latitude: " + str(latitude) + " type: " + str(type(latitude)))
                except (TypeError, KeyError, ValueError):
#                    print("could not retrieve latitude, set to null")
                    latitude = None
                    
                # longitude
                try:
                    longitude = float(soup.find("meta", {"property":"og:longitude"})['content'])
#                    print("longitude: " + str(longitude) + " type: " + str(type(longitude)))
                except (TypeError, KeyError, ValueError):
#                    print("could not retrieve longitude, set to null")
                    longitude = None
                
                # ad_category
                # breadcrumb rules (0-indexed, matching the code below):
                #   with 4 crumbs, the category is crumbs[2] and the subcategory crumbs[3]
                #   with more than 4, the category is crumbs[3] and the subcategory crumbs[4]
                #   when 'Acheter et vendre' ("Buy and sell") sits in the category spot,
                #   category and subcategory shift one position right; 'Immobilier'
                #   (real estate) pages are another special case
                try:
                    crumb_elements = soup.select('li[class*="crumbItem"]')
                    crumbs = [crumb.text for crumb in crumb_elements]

                    if len(crumbs) == 4:
                        if crumbs[2] == 'Acheter et vendre':
                            cat_index = 3
                        elif crumbs[1] == 'Immobilier':
                            cat_index = 1
                        else:
                            cat_index = 2
                    else:
                        cat_index = 4 if crumbs[3] == 'Acheter et vendre' else 3

                    ad_category = crumbs[cat_index] if cat_index < len(crumbs) else None
                    ad_subcategory = crumbs[cat_index + 1] if cat_index + 1 < len(crumbs) else None

                    # category strings sometimes embed the location after 'à'
                    # (e.g. "Meubles à Ville de Québec"); keep only the part before it
                    if ad_category and 'à' in ad_category:
                        ad_category = ad_category[:ad_category.rfind('à')].strip()
                    if ad_subcategory and 'à' in ad_subcategory:
                        ad_subcategory = ad_subcategory[:ad_subcategory.rfind('à')].strip()

#                    print("ad_category: " + str(ad_category) + " type: " + str(type(ad_category)))
#                    print("ad_subcategory: " + str(ad_subcategory) + " type: " + str(type(ad_subcategory)))
                except Exception:
#                    print("could not retrieve ad_category, set to null")
                    ad_category = None
                    ad_subcategory = None
               
                # visit count
                try:
                    visit_count = int(soup.select('div[class*="visitCounter"]')[0].text.split()[0])                    
#                    print("visit_count: " + str(visit_count) + " type: " + str(type(visit_count)))
                except (IndexError, ValueError):
#                    print("could not retrieve preformat_visit_count, set to null")                    
                    visit_count = None
                    
                # photo_count
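                # an ad page shows one hero image, up to three thumbnails, and a
                # "+N" badge when there are more photos than thumbnail slots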
                
                main_photo = soup.select('div[class*="heroImageContainer"]')
                photo_count = 1 if main_photo else 0
                
                # THUMBNAILS
                t1 = soup.select('div[class*="thumbnailOne"]')
                t2 = soup.select('div[class*="thumbnailTwo"]')
                t3 = soup.select('div[class*="thumbnailThree"]')
                t3e = soup.select('div[class*="noBorderThumbnail"]')

                for thumb in (t1, t2, t3):
                    if thumb:
                        photo_count += 1

                # a noBorderThumbnail overlaps one of the slots above, so don't count it twice
                if t3e:
                    photo_count -= 1
                    
                # LAST THUMBNAIL SHOWS A "+N" FOR THE REMAINING PHOTOS
                more_images = soup.select('div[class*="moreImages"]')
                if more_images:
                    extra = int(more_images[0].contents[0].replace('+', ''))
                    photo_count = photo_count + extra - 1
    
                # bundle the scraped data into a tuple, ready for insertion into the database
                insert_values = (title, price, description, address, link, ad_code, publish_datetime, city, latitude,\
                                 longitude, ad_category, ad_subcategory, visit_count, photo_count)
#                print(insert_values)
                new_ads.append(insert_values)
                existing_ad_codes.add(ad_code)
                
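    # batch insert; OR IGNORE skips any row that violates a uniqueness
    # constraint (such as a duplicate ad_code) instead of raising an error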
    cursor.executemany("""INSERT OR IGNORE INTO ads(title,price,description,address,link,ad_code,publish_datetime,city,latitude,\
                      longitude,ad_category,ad_subcategory,visit_count,photo_count) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", new_ads)
    db.commit()    
    db.close()
    
    print("get_new_ads done... inserting {} new ads in DB".format(len(new_ads)))
    duration = datetime.now() - start_time
    print("It took:",duration)

if __name__=="__main__":
    get_new_ads()
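
Once the database has accumulated some ads, the pricing question from the top of this post becomes a simple query. Here is a minimal sketch (assuming the ads table described earlier) that prints the median asking price per subcategory; the median is a more robust reference point than the average when a few ads are wildly overpriced:

import sqlite3
from statistics import median

db = sqlite3.connect('kijiji.db')
cursor = db.cursor()

# collect asking prices per subcategory, ignoring free and unpriced ads
cursor.execute("""SELECT ad_subcategory, price FROM ads
                  WHERE price IS NOT NULL AND price > 0
                  AND ad_subcategory IS NOT NULL""")
prices = {}
for subcategory, price in cursor.fetchall():
    prices.setdefault(subcategory, []).append(price)
db.close()

for subcategory, values in sorted(prices.items()):
    print("{}: {} ads, median {:.2f} $".format(subcategory, len(values), median(values)))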