Defining a method for scraping the web page

PHOTO

Tue May 21 2024 15:24:08 GMT+0000 (Coordinated Universal Time)

Saved by @Uncoverit #python

# Defining a method for scraping the web page
def scrape_perfume_data(url):
    headers = {
        "User-Agent": "Insert user agent"
    }
    page = requests.get(url, headers=headers)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    
    perfume_name = brand_name = release_year = overall_rating = rating_count = perfumer = scent_rating = longevity_rating = sillage_rating = bottle_rating = value_rating = scraping_date = None
    
    try:
        perfume_name = soup.find('h1', class_='p_name_h1', itemprop='name').get_text().strip().split('\n')[0].strip()
    except AttributeError:
        pass
    
    try:
        brand_span = soup.find('span', itemprop='brand')
        brand_name = brand_span.find('span', itemprop='name').get_text().strip()
    except AttributeError:
        pass
    
    try:
        year = soup.find('a', href=lambda href: href and 'Release_Years' in href)
        release_year = year.get_text().strip()
    except AttributeError:
        pass
    
    try:
        overall_rating = soup.find('span', itemprop='ratingValue').get_text().strip()
    except AttributeError:
        pass
    
    try:
        rating_count = soup.find('span', itemprop='ratingCount').get_text().strip()
    except AttributeError:
        pass
    
    try:
        perfumer = soup.find('div', {'class': 'w-100 mt-0-5 mb-3'}).get_text().strip()
    except AttributeError:
        pass
    
    try:
        top_notes = soup.find('div', class_='pyramid_block nb_t w-100 mt-2')
        top_note_list = [span.get_text(strip=True) for span in top_notes.find_all('span', class_='clickable_note_img')]
    except AttributeError:
        pass
    
    try:
        heart_notes = soup.find('div', class_='pyramid_block nb_m w-100 mt-2')
        heart_note_list = [span.get_text(strip=True) for span in heart_notes.find_all('span', class_='clickable_note_img')]
    except AttributeError:
        pass
    
    try:
        base_notes = soup.find('div', class_='pyramid_block nb_b w-100 mt-2')
        base_note_list = [span.get_text(strip=True) for span in base_notes.find_all('span', class_='clickable_note_img')]
    except AttributeError:
        pass
   
    
    scraping_date = datetime.date.today()
    
    return {
        'PerfumeName': perfume_name, 
        'Brand': brand_name, 
        'ReleaseYear': release_year, 
        'OverallRating': overall_rating, 
        'RatingCount': rating_count, 
        'Perfumer': perfumer, 
        'TopNotes': top_note_list if 'top_note_list' in locals() else None, 
        'HeartNotes': heart_note_list if 'heart_note_list' in locals() else None, 
        'BaseNotes': base_note_list if 'base_note_list' in locals() else None, 
    }

Save snippets that work from anywhere online with our extensions

Available in the Chrome Web Store

Get Firefox Add-on

Get VS Code extension

Comments

More like this

Parfumo web scraping

@Uncoverit

Defining a method for scraping the web page Defining a method for collecting the urls we want to scrape This method should parse through every page within the new release directory Collecting the urls we want to scrape Collecting the data from URLs we have gathered Saving the data to the CSV file Iterating over rows and counting the occurrences of ingredients listed in the 'TopNotes' column Cleaning the index values and aggregating the counts of ingredients

Importing images from a directory (Python) to list or dictionary

from PIL import Image
import glob
image_list = []
for filename in glob.glob('yourpath/*.gif'): #assuming gif
    im=Image.open(filename)
    image_list.append(im)

python - Find out the percentage of missing values in each column in the given dataset - Stack Overflow

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

#python #python #loops #whileloop

Print the name of 7 days in a week - by using while loop

days = 0
week = [‘Monday’, ‘Tuesday’, ‘Wednesday’, ‘Thursday’, ‘Friday’, ‘Saturday’, 3.‘Sunday’]
while day < 7:
print(“Today is” + week[days])
days += 1

Getting the index of an item in a list containing it in Python

>>> ["foo", "bar", "baz"].index("bar")
1

#javascript #python #search #historicalcode #google #algorithms

Google’s PageRank Algorithm from 1996 - the origin of internet search

import numpy as np

def pagerank(M, num_iterations=100, d=0.85):
    N = M.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    iteration = 0
    while iteration < num_iterations:
        iteration += 1
        v = d * np.matmul(M, v) + (1 - d) / N
    return v

#python #python #strings #vowels #function

Get vowels in strings

This method gets vowels (‘a’, ‘e’, ‘i’, ‘o’, ‘u’) found in a string.
   
#make a function:
def get_vowels(string):

#return is the keyword which means function have to return value: 
 return [each for each in string if each in 'aeiou']


#assign the words and function will return vowels words.
get_vowels('foobar') # ['o', 'o', 'a']


get_vowels('gym') # []

Could not build wheels for tokenizers which use PEP 517 and cannot be installed directly

https://github.com/pydata/bottleneck/issues/281

How To Bypass Cloudflare Bot Protection In Selenium - CodingTutz

options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(options=options)

Python Loop through Excel sheets, place into one df - Stack Overflow

import pandas as pd

sheets_dict = pd.read_excel('Book1.xlsx', sheetname=None)

full_table = pd.DataFrame()
for name, sheet in sheets_dict.items():
    sheet['sheet'] = name
    sheet = sheet.rename(columns=lambda x: x.split('\n')[-1])
    full_table = full_table.append(sheet)

full_table.reset_index(inplace=True, drop=True)

print full_table

#python #dates #functions #python3.8

How to parse a String into Datetime in Python

from datetime import datetime

datetime_object = datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')

python - Way to change Google Chrome user agent in Selenium? - Stack Overflow

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

options = Options()
ua = UserAgent()
userAgent = ua.random
print(userAgent)
options.add_argument(f'user-agent={userAgent}')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
driver.get("https://www.google.co.in")
driver.quit()

python - How to see the progress bar of read_csv - Stack Overflow

def read_csv_pgbar(csv_path, chunksize, usecols, dtype=object):


    # print('Getting row count of csv file')

    rows = sum(1 for _ in open(csv_path, 'r')) - 1 # minus the header
    # chunks = rows//chunksize + 1
    # print('Reading csv file')
    chunk_list = []

    with tqdm(total=rows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize, usecols=usecols, dtype=dtype):
            chunk_list.append(chunk)
            bar.update(len(chunk))

    df = pd.concat((f for f in chunk_list), axis=0)
    print('Finish reading csv file')

    return df

#python #python #lists #dictionary

Convert two lists into a dictionary

keys, values)) # {'a': 2, 'c': 4, 'b': 3}
 
 
#make a function: def is the keyword for the function:
def to_dictionary(keys, values):
 
 
#return is the keyword that tells program that function has to return value   
return dict(zip(keys, values))
 
  
 
# keys and values are the lists:
 
keys = ["a", "b", "c"]   
 
values = [2, 3, 4]

#python #interesting #arrays #sorting #interviewquestions

Sorting an array without changing position of negative numbers

# Python3 implementation of the approach 

# Function to sort the array such that 
# negative values do not get affected 
def sortArray(a, n): 

	# Store all non-negative values 
	ans=[] 
	for i in range(n): 
		if (a[i] >= 0): 
			ans.append(a[i]) 

	# Sort non-negative values 
	ans = sorted(ans) 

	j = 0
	for i in range(n): 

		# If current element is non-negative then 
		# update it such that all the 
		# non-negative values are sorted 
		if (a[i] >= 0): 
			a[i] = ans[j] 
			j += 1

	# Print the sorted array 
	for i in range(n): 
		print(a[i],end = " ") 


# Driver code 

arr = [2, -6, -3, 8, 4, 1] 

n = len(arr) 

sortArray(arr, n)

#python ##python #strings #comments

Create simple string along with variables

#assign a value to a variable:
types_of_people = 10 
# make a string using variable name:
X = f “there are {types_of_people} types of people.”

Output:
There are 10 types of people

Browse more snippets >>