Defining a method for scraping the web page

PHOTO EMBED

Tue May 21 2024 15:24:08 GMT+0000 (Coordinated Universal Time)

Saved by @Uncoverit #python

def _stripped_text(tag):
    """Return the whitespace-stripped text of *tag*, or None if *tag* is None."""
    if tag is None:
        return None
    return tag.get_text().strip()


def _note_names(soup, block_class):
    """Return the note names listed in the pyramid block with *block_class*.

    Returns None when the block is absent from the page, and an empty list
    when the block exists but contains no note spans.
    """
    block = soup.find('div', class_=block_class)
    if block is None:
        return None
    return [
        span.get_text(strip=True)
        for span in block.find_all('span', class_='clickable_note_img')
    ]


def scrape_perfume_data(url, timeout=30):
    """Fetch a perfume page and extract its headline fields.

    Args:
        url: Address of the perfume page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the caller indefinitely.

    Returns:
        A dict with the keys ``PerfumeName``, ``Brand``, ``ReleaseYear``,
        ``OverallRating``, ``RatingCount``, ``Perfumer``, ``TopNotes``,
        ``HeartNotes`` and ``BaseNotes``. Scalar fields are the stripped text
        exactly as it appears on the page (no numeric conversion); the note
        fields are lists of strings. Any field whose element is missing from
        the page is None.

    Raises:
        Whatever ``requests.get`` raises on connection failure or timeout,
        and whatever ``raise_for_status()`` raises for an unsuccessful HTTP
        status.
    """
    headers = {
        # Placeholder value: replace with a real User-Agent string.
        "User-Agent": "Insert user agent"
    }
    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # The heading can carry extra lines after the name; keep the first only.
    perfume_name = _stripped_text(
        soup.find('h1', class_='p_name_h1', itemprop='name')
    )
    if perfume_name is not None:
        perfume_name = perfume_name.split('\n')[0].strip()

    # The brand name sits in a nested span inside the brand span.
    brand_span = soup.find('span', itemprop='brand')
    brand_name = None
    if brand_span is not None:
        brand_name = _stripped_text(brand_span.find('span', itemprop='name'))

    # The release year is the text of the first link whose href mentions
    # 'Release_Years'.
    release_year = _stripped_text(
        soup.find('a', href=lambda href: href and 'Release_Years' in href)
    )

    overall_rating = _stripped_text(soup.find('span', itemprop='ratingValue'))
    rating_count = _stripped_text(soup.find('span', itemprop='ratingCount'))
    perfumer = _stripped_text(
        soup.find('div', {'class': 'w-100 mt-0-5 mb-3'})
    )

    return {
        'PerfumeName': perfume_name,
        'Brand': brand_name,
        'ReleaseYear': release_year,
        'OverallRating': overall_rating,
        'RatingCount': rating_count,
        'Perfumer': perfumer,
        'TopNotes': _note_names(soup, 'pyramid_block nb_t w-100 mt-2'),
        'HeartNotes': _note_names(soup, 'pyramid_block nb_m w-100 mt-2'),
        'BaseNotes': _note_names(soup, 'pyramid_block nb_b w-100 mt-2'),
    }
content_copyCOPY