Simple website scraping with BeautifulSoup

PHOTO EMBED

Sat Oct 29 2022 11:28:48 GMT+0000 (Coordinated Universal Time)

Saved by @janduplessis883 ##scraping ##beautifulsoup

# YOUR CODE HERE
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib

# Fetch the page to scrape. The timeout keeps the request from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP
# error response (4xx/5xx) into an exception instead of letting an error
# page be parsed as if it were the book listing.
url = "http://books.toscrape.com/"
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, "html.parser")

# Every book on the page is an <article class="product_pod"> element.
books_html = soup.find_all("article", class_="product_pod")
len(books_html)

books_html[0]

# Title: read from the `title` attribute of the link inside the book's <h3>.
book_title = books_html[0].find("h3").find("a").attrs['title']
print(book_title)

# Price: text of <p class="price_color">; [1:] drops the first character
# (presumably the currency symbol -- verify against the page).
book_price = books_html[0].find("p", class_="price_color").string
print(book_price[1:])

# Rating element: its `class` attribute is what parse_rating() inspects.
book_stars_html = books_html[0].find("p", class_="star-rating")

def parse_rating(rating_classes):
    """Translate the classes of a star-rating element into a number.

    Args:
        rating_classes: Object searched with ``in`` for the rating words,
            e.g. the ``class`` attribute of a star-rating element.

    Returns:
        The integer 1-5 matching the first of 'One', 'Two', 'Three',
        'Four', 'Five' (checked in that order) found in ``rating_classes``,
        or 0 when none of them is present.
    """
    rating_words = ('One', 'Two', 'Three', 'Four', 'Five')
    for score, word in enumerate(rating_words, start=1):
        if word in rating_classes:
            return score
    return 0

# Try the rating parser on the star-rating element found earlier.
parse_rating(book_stars_html.attrs['class'])

# Same lookup done from scratch for the first book.
first_book_stars = books_html[0].find("p", class_="star-rating")
book_rating = parse_rating(first_book_stars.attrs['class'])

# Column-oriented storage: one list per field, filled in parallel below.
books_dict = {'Title': [], 'Price': [], 'Rating': []}

for book in books_html:
    # Locate the three elements that carry the data for this book.
    link_tag = book.find("h3").find("a")
    price_tag = book.find("p", class_="price_color")
    stars_tag = book.find("p", class_="star-rating")

    title = link_tag.attrs["title"]
    price = float(price_tag.text[1:])  # skip the first character before converting
    rating = parse_rating(stars_tag.attrs['class'])

    books_dict["Title"].append(title)
    books_dict["Price"].append(price)
    books_dict["Rating"].append(rating)

books_dict

# Sanity checks: 3 key:value pairs are expected ...
len(books_dict)
# ... and each value should hold one entry per book on the page (20).
len(books_dict["Title"])

books_df = pd.DataFrame.from_dict(books_dict)
books_df

pip install XlsxWriter

books_df.to_excel('books.xlsx', sheet_name='Books')
content_copyCOPY