Defining a method for scraping the web page
Tue May 21 2024 15:24:08 GMT+0000 (Coordinated Universal Time)
Saved by @Uncoverit #python
# Defining a method for scraping the web page
def _stripped_text(tag):
    """Return the whitespace-stripped text of *tag*, or None if *tag* is None."""
    if tag is None:
        return None
    return tag.get_text().strip()


def _pyramid_notes(soup, block_class):
    """Return the note names inside the pyramid ``div`` with *block_class*.

    Returns None when the ``div`` is not present on the page, and a (possibly
    empty) list of strings otherwise.
    """
    block = soup.find('div', class_=block_class)
    if block is None:
        return None
    return [
        span.get_text(strip=True)
        for span in block.find_all('span', class_='clickable_note_img')
    ]


def scrape_perfume_data(url, timeout=30):
    """Download a perfume page and extract its headline fields.

    Every field is looked up independently; a field whose element is missing
    from the page is reported as None instead of aborting the whole scrape.

    Args:
        url: Address of the perfume page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.

    Returns:
        A dict with the keys ``PerfumeName``, ``Brand``, ``ReleaseYear``,
        ``OverallRating``, ``RatingCount``, ``Perfumer``, ``TopNotes``,
        ``HeartNotes`` and ``BaseNotes``. Scalar fields are stripped strings
        or None; the three note fields are lists of strings or None.

    Raises:
        Whatever ``requests.get`` raises on connection failure or timeout,
        and the error raised by ``Response.raise_for_status()`` when the
        server answers with an error status.
    """
    # NOTE: the User-Agent value is a placeholder and must be filled in.
    headers = {
        "User-Agent": "Insert user agent"
    }
    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # The heading can carry extra lines after the name; keep only the first.
    perfume_name = _stripped_text(
        soup.find('h1', class_='p_name_h1', itemprop='name')
    )
    if perfume_name is not None:
        perfume_name = perfume_name.split('\n')[0].strip()

    # The brand name is a nested span inside the itemprop="brand" span.
    brand_span = soup.find('span', itemprop='brand')
    brand_name = None
    if brand_span is not None:
        brand_name = _stripped_text(brand_span.find('span', itemprop='name'))

    # The release year is the text of the first link whose href mentions
    # 'Release_Years'.
    release_year = _stripped_text(
        soup.find('a', href=lambda href: href and 'Release_Years' in href)
    )

    overall_rating = _stripped_text(soup.find('span', itemprop='ratingValue'))
    rating_count = _stripped_text(soup.find('span', itemprop='ratingCount'))
    perfumer = _stripped_text(soup.find('div', {'class': 'w-100 mt-0-5 mb-3'}))

    return {
        'PerfumeName': perfume_name,
        'Brand': brand_name,
        'ReleaseYear': release_year,
        'OverallRating': overall_rating,
        'RatingCount': rating_count,
        'Perfumer': perfumer,
        'TopNotes': _pyramid_notes(soup, 'pyramid_block nb_t w-100 mt-2'),
        'HeartNotes': _pyramid_notes(soup, 'pyramid_block nb_m w-100 mt-2'),
        'BaseNotes': _pyramid_notes(soup, 'pyramid_block nb_b w-100 mt-2'),
    }
Comments