Labrecquev
English

labrecquev.ca

L'espace de Vincent sur le web


Web Scraping des annonces classées avec Python


Comment déterminer un bon prix lorsqu'on vend ou achète un article de seconde main?

J'ai écrit un script qui collecte les données de toutes les annonces sur Kijiji Quebec et les sauvegarde dans une base de données SQLite. Les données peuvent ensuite être analysées par catégorie, gamme de prix, et autres attributs en utilisant Excel ou Python.

Voici un aperçu du code:

        import requests
import sqlite3
from datetime import datetime
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup

# general purpose code that helped me bypass errors when requesting web pages
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Scrape Kijiji for new ads and collect ad data in the database
# --- parsing helpers --------------------------------------------------------

def _first_text(soup, selector):
    """Return the text of the first node matching *selector*, or None."""
    nodes = soup.select(selector)
    return nodes[0].text if nodes else None


def _meta_float(soup, prop):
    """Return the content of a ``<meta property=...>`` tag as float, or None."""
    tag = soup.find("meta", {"property": prop})
    try:
        return float(tag['content'])
    except (TypeError, KeyError, ValueError):
        # tag missing, no 'content' attribute, or non-numeric content
        return None


def _parse_price(soup):
    """Extract the asking price from an ad page.

    Returns a float, 0 for free/trade ads ("Gratuit"/"Échange"), or None
    when no numeric price is displayed (e.g. "Sur demande").
    """
    # Preferred source: the machine-readable itemprop="price" attribute.
    tag = soup.find("span", {"itemprop": "price"})
    if tag is not None and tag.has_attr('content'):
        try:
            return float(tag['content'])
        except ValueError:
            pass
    # Fallback: the displayed price label, e.g. "1 234,56 $".
    label = _first_text(soup, 'span[class*="currentPrice"]')
    if label is None:
        return None
    label = label.replace(' ', '')
    # NOTE: these string checks previously ran only AFTER float() had already
    # succeeded or raised, so they could never match; the comparison now
    # happens on the raw label as originally intended.
    if label in ("Gratuit", "Échange"):
        return 0
    if label in ("Surdemande", ""):
        return None
    try:
        return float(label.replace('$', '').replace(',', '.'))
    except ValueError:
        return None


def _crumb_text(crumbs, index):
    """Text of breadcrumb *index*, or None when it does not exist."""
    return crumbs[index].text if index < len(crumbs) else None


def _strip_location(label):
    """Drop the ' à <location>' suffix Kijiji appends to breadcrumb labels."""
    if label is not None and 'à' in label:
        return label[:label.rfind('à')].strip()
    return label


def _parse_categories(soup):
    """Derive ``(ad_category, ad_subcategory)`` from the breadcrumb trail.

    Breadcrumb rules (0-indexed):
      - exactly 4 crumbs: category = crumbs[2], subcategory = crumbs[3],
        unless crumbs[2] is 'Acheter et vendre' (shift right by one) or
        crumbs[1] is 'Immobilier' (shift left by one);
      - more than 4 crumbs: category = crumbs[3], subcategory = crumbs[4],
        unless crumbs[3] is 'Acheter et vendre' (shift right by one).
    Missing crumbs yield None for the corresponding value.
    """
    crumbs = soup.select('li[class*="crumbItem"]')
    if len(crumbs) == 4:
        if crumbs[2].text == 'Acheter et vendre':
            category = _crumb_text(crumbs, 3)
            subcategory = _crumb_text(crumbs, 4)
        elif crumbs[1].text == 'Immobilier':
            category = crumbs[1].text
            subcategory = crumbs[2].text
        else:
            category = crumbs[2].text
            subcategory = crumbs[3].text
    elif len(crumbs) > 4:
        if crumbs[3].text == 'Acheter et vendre':
            category = _crumb_text(crumbs, 4)
            subcategory = _crumb_text(crumbs, 5)
        else:
            category = crumbs[3].text
            subcategory = _crumb_text(crumbs, 4)
    else:
        # fewer than 4 crumbs: no reliable category information
        return None, None
    return _strip_location(category), _strip_location(subcategory)


def _parse_photo_count(soup):
    """Count the photos on an ad page from its gallery markup."""
    count = 1 if soup.select('div[class*="heroImageContainer"]') else 0
    for selector in ('div[class*="thumbnailOne"]',
                     'div[class*="thumbnailTwo"]',
                     'div[class*="thumbnailThree"]'):
        if soup.select(selector):
            count += 1
    # the "no border" thumbnail is a styling duplicate, not an extra photo
    if soup.select('div[class*="noBorderThumbnail"]'):
        count -= 1
    # the last thumbnail may carry a "+N" overlay meaning N more photos exist
    more_images = soup.select('div[class*="moreImages"]')
    if more_images:
        try:
            extra = int(more_images[-1].contents[0].replace('+', ''))
            count += extra - 1
        except (TypeError, ValueError, IndexError):
            pass  # overlay present but unparsable: keep the visible count
    return count


# Scrape Kijiji for new ads and collect ad data in the database
def get_new_ads():
    """Scrape new Kijiji Quebec ads and insert them into ``kijiji.db``.

    Walks the search-result pages (max 100), skips ads whose ad_code is
    already in the database, and stops early after 100 consecutive
    already-seen ads. All collected rows are inserted in a single batch
    at the end.
    """
    start_time = datetime.now()
    print("get_new_ads start, it is currently:", start_time)
    db = sqlite3.connect('kijiji.db')
    try:
        cursor = db.cursor()

        # Set of already-scraped ad codes for O(1) duplicate checks.
        cursor.execute("""select ad_code from ads""")
        existing_ad_codes = {row[0] for row in cursor.fetchall()}

        match_count = 0  # consecutive already-seen ads; 100 in a row => stop
        new_ads = []
        url1 = "https://www.kijiji.ca/b-quebec/page-"
        url2 = "/l9001"

        for x in range(1, 100):  # Kijiji caps search results at page 100
            # Check BEFORE fetching so we do not waste a request on a page
            # we are about to abandon.
            if match_count >= 100:
                print("match_count >= 100")
                break
            print("page: " + str(x))
            r = session.get(url1 + str(x) + url2)
            page_soup = BeautifulSoup(r.content, "html.parser")

            # Scrape data from each ad thumbnail on the search-result page.
            for item in page_soup.find_all('div', {"class": "regular-ad"}):
                # Filter out third-party sponsored ads.
                if "third-party" in item.attrs["class"]:
                    continue
                anchor = item.find("a", {"class": "title"})
                if anchor is None or not anchor.has_attr("href"):
                    continue  # no usable link to the ad page
                link = "https://www.kijiji.ca" + anchor["href"]

                title = item.find("div", {"class": "title"}).contents[1].text.strip()
                city = item.find("div", {"class": "location"}).contents[0].strip()

                # Fetch the individual ad page for the remaining fields.
                r = session.get(link)
                soup = BeautifulSoup(r.content, "html.parser")

                # ad_code is the deduplication key; without it, skip the ad.
                try:
                    ad_code = int(soup.select('a[class*="adId"]')[0].text)
                except (IndexError, ValueError):
                    continue
                if ad_code in existing_ad_codes:
                    print("ad_code in existing_ad_codes, continue... timestamp: {}".format(datetime.now().strftime("%X")))
                    match_count += 1
                    continue
                match_count = 0  # a fresh ad resets the duplicate streak

                # publication datetime (None when absent)
                time_tag = soup.find("time")
                if time_tag is not None and time_tag.has_attr('datetime'):
                    publish_datetime = time_tag['datetime']
                else:
                    publish_datetime = None

                price = _parse_price(soup)

                # description (newlines flattened; None when absent)
                desc_tag = soup.find("div", {"itemprop": "description"})
                description = desc_tag.text.replace("\n", " ") if desc_tag else None

                address = _first_text(soup, 'span[class*="address"]')
                latitude = _meta_float(soup, "og:latitude")
                longitude = _meta_float(soup, "og:longitude")
                ad_category, ad_subcategory = _parse_categories(soup)

                # visit count, e.g. "123 visites" -> 123 (None when absent)
                visit_text = _first_text(soup, 'div[class*="visitCounter"]')
                try:
                    visit_count = int(visit_text.split()[0])
                except (AttributeError, IndexError, ValueError):
                    visit_count = None

                photo_count = _parse_photo_count(soup)

                # Tuple ordered to match the INSERT statement below.
                new_ads.append((title, price, description, address, link,
                                ad_code, publish_datetime, city, latitude,
                                longitude, ad_category, ad_subcategory,
                                visit_count, photo_count))
                existing_ad_codes.add(ad_code)

        cursor.executemany(
            """INSERT OR IGNORE INTO ads(title,price,description,address,link,ad_code,publish_datetime,city,latitude,
            longitude,ad_category,ad_subcategory,visit_count,photo_count) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            new_ads)
        db.commit()
    finally:
        # Close the connection even if scraping aborts mid-run.
        db.close()

    print("get_new_ads done... inserting {} new ads in DB".format(len(new_ads)))
    duration = datetime.now() - start_time
    print("It took:", duration)

if __name__ == "__main__":
    # Run the scraper when the file is executed as a script.
    get_new_ads()