Grab report details from scraped Urban Institute publication pages.
Sun Jul 21 2024 12:50:18 GMT+0000 (Coordinated Universal Time)
def find_type_of_research(soup):
    """Return the 'eyebrow' label (research type) from the page, or NaN."""
    hits = soup.find_all('lbj-title', attrs={'variant': 'eyebrow'})
    if hits:
        return hits[0].text.strip()
    return np.nan


def find_title(soup):
    """Return the page title from either markup variant, or NaN."""
    hits = soup.find_all('lbj-title', attrs={'variant': 'heading-1'})
    if hits:
        return hits[0].text.strip()
    hits = soup.find_all('h1', attrs={'class': 'title heading-1'})
    if hits:
        return hits[0].text.strip()
    return np.nan


def find_subtitle(soup):
    """Return the subtitle, or NaN when absent.

    Bug fix: the original fell through and returned None on a miss,
    inconsistent with every sibling finder that returns np.nan.
    """
    hits = soup.find_all('lbj-title', attrs={'variant': 'subtitle'})
    if hits:
        return hits[0].text.strip()
    return np.nan


def find_authors(soup):
    """Return a comma-separated string of author names, or NaN."""
    hits = soup.find_all('lbj-title', attrs={'variant': 'paragraph'})
    if not hits:
        return np.nan
    # Each author is wrapped in its own <lbj-link> element.
    authors = [a.text.strip() for a in hits[0].find_all('lbj-link')]
    return ', '.join(authors)


def find_date(soup):
    """Return the publication date text from the <time> element, or NaN."""
    hits = soup.find_all('lbj-title', attrs={'variant': 'date'})
    if hits:
        return hits[0].find_all('time')[0].text.strip()
    return np.nan


def find_report_link(soup):
    """Return the href of the first <lbj-button> on the page, or NaN."""
    buttons = soup.find_all('lbj-button')
    if buttons:
        return buttons[0].get('href')
    return np.nan


def find_tags(soup):
    """Return a comma-separated string of tag texts, or NaN.

    Bug fix: the original only printed the raw tag elements and always
    returned None; this joins the tag texts like find_authors does.
    """
    tags = soup.find_all('lbj-link', attrs={'variant': 'tag'})
    if tags:
        return ', '.join(t.text.strip() for t in tags)
    return np.nan


def find_data_json(soup):
    """Extract page metadata embedded in the dataLayer_tags JS blob.

    Scans <script type="text/javascript"> elements for the
    ``dataLayer_tags = {...};`` assignment and parses its literal.

    Returns
    -------
    tuple
        (publish_date, title, research_area, authors, publication_type,
        eyebrow, tags, info_json) — every element is np.nan when the
        blob is not found on the page.
    """
    import ast  # local import: only this function needs it

    pattern = r'dataLayer_tags = (.+);'
    for script in soup.find_all('script', attrs={'type': 'text/javascript'}):
        match = re.search(pattern, str(script))
        if not match:
            continue
        info_json = match.group(1)
        # Security fix: the original called eval() on text scraped from a
        # remote site, which executes arbitrary code. literal_eval parses
        # the same dict/list/str literals without any code execution.
        info = ast.literal_eval(info_json)['urban_page']
        return (info.get('publish_date'), info.get('urban_title'),
                info.get('research_area'), info.get('authors'),
                info.get('publication_type'), info.get('eyebrow'),
                info.get('tags'), info_json)
    return (np.nan,) * 8


def main():
    """Scrape detail fields for every link in urban_links.csv and save them."""
    df_links = pd.read_csv('urban_links.csv')
    df_links_details = df_links.copy()
    for i in range(len(df_links)):
        URL = "https://www.urban.org/" + df_links['link'][i]
        print(URL)
        r = requests.get(URL)
        soup = BeautifulSoup(r.content, 'html5lib')
        report_link = find_report_link(soup)
        (publish_date, title, research_area, authors,
         publication_type, eyebrow, tags, info_json) = find_data_json(soup)
        df_links_details.loc[i, 'eyebrow'] = eyebrow
        df_links_details.loc[i, 'title'] = title
        df_links_details.loc[i, 'authors'] = authors
        df_links_details.loc[i, 'date'] = publish_date
        df_links_details.loc[i, 'research_area'] = research_area
        df_links_details.loc[i, 'publication_type'] = publication_type
        df_links_details.loc[i, 'tags'] = tags
        df_links_details.loc[i, 'info_json'] = info_json
        df_links_details.loc[i, 'report_link'] = report_link
        print(publish_date, title, research_area, authors,
              publication_type, eyebrow, tags, report_link)
        # Checkpoint periodically so a mid-run crash loses little work.
        if i % 200 == 0:
            df_links_details.to_csv('urban_links_details.csv', index=False)
    # Bug fix: the original never saved rows scraped after the last
    # i % 200 checkpoint; write the final, complete frame.
    df_links_details.to_csv('urban_links_details.csv', index=False)


if __name__ == "__main__":
    main()
Comments