Urban Institute scrape

PHOTO EMBED

Sun Jul 21 2024 12:08:54 GMT+0000 (Coordinated Universal Time)

Saved by @sty003 #python

# Collect the link of every article card on the urban.org research listing
# pages and write them, together with the listing page number they were found
# on, to urban_links.csv.
#
# Expects `requests`, `BeautifulSoup` (bs4) and `pd` (pandas) to already be
# in scope.

PAGE_NUM = 281  # exclusive upper bound: pages 1 .. PAGE_NUM - 1 are fetched
# NOTE(review): the loop starts at page=1; if the site's pager is zero-based,
# the first listing page (?page=0) is never fetched -- confirm.

# Exact class string of the <li> element that wraps one article card.
ARTICLE_CLASS = 'mb-16 md:col-span-2 mb-8 sm:mb-0'

# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

df_links = []

# A single session reuses the underlying connection for all page requests.
with requests.Session() as session:
    for page in range(1, PAGE_NUM):
        URL = f"https://www.urban.org/research?page={page}"
        try:
            r = session.get(URL, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx as failures instead of parsing an error page
            # and silently recording zero links for it.
            r.raise_for_status()
        except requests.RequestException as exc:
            # Report and move on so the links gathered so far still get saved.
            print(f'Page {page} failed: {exc}')
            continue

        soup = BeautifulSoup(r.content, 'html5lib')
        articles = soup.find_all('li', attrs={'class': ARTICLE_CLASS})
        for article in articles:
            # Read the href from the parsed tree rather than regex-matching
            # the re-serialized HTML: the serialized form escapes '&' as
            # '&amp;', and a bare 'href="' pattern also matches attributes
            # such as data-href.
            anchor = article.find('a', href=True)
            if anchor is not None and anchor['href']:
                df_links.append([page, anchor['href']])
        print(f'Page {page} done')

# One row per article: the listing page number and the article's link.
df_links = pd.DataFrame(df_links, columns=['page', 'link'])
df_links.to_csv('urban_links.csv', index=False)
content_copyCOPY