def find_type_of_research(soup):
    """Return the text of the page's first "eyebrow" title.

    Searches ``soup`` for ``<lbj-title variant="eyebrow">`` elements and
    returns the whitespace-stripped text of the first one. When the page has
    no such element, ``np.nan`` is returned instead.
    """
    eyebrows = soup.find_all('lbj-title', attrs={'variant': 'eyebrow'})
    if not eyebrows:
        return np.nan
    return eyebrows[0].text.strip()
def find_title(soup):
    """Return the page title, trying two markups in order.

    The first ``<lbj-title variant="heading-1">`` element wins; if the page
    has none, the first ``<h1 class="title heading-1">`` element is used.
    The text is returned with surrounding whitespace stripped. ``np.nan`` is
    returned when neither markup is present.
    """
    # Ordered by priority: the first selector that matches supplies the title.
    selectors = (
        ('lbj-title', {'variant': 'heading-1'}),
        ('h1', {'class': 'title heading-1'}),
    )
    for tag_name, attributes in selectors:
        matches = soup.find_all(tag_name, attrs=attributes)
        if matches:
            return matches[0].text.strip()
    return np.nan
def find_subtitle(soup):
    """Return the text of the page's first subtitle element.

    Searches ``soup`` for ``<lbj-title variant="subtitle">`` elements and
    returns the whitespace-stripped text of the first one.

    Returns ``np.nan`` when the page has no subtitle, matching the other
    ``find_*`` extractors in this file (previously the function fell off the
    end and returned ``None`` in that case).
    """
    subtitles = soup.find_all('lbj-title', attrs={'variant': 'subtitle'})
    if not subtitles:
        return np.nan
    return subtitles[0].text.strip()
def find_authors(soup):
    """Return the author names on the page as one comma-separated string.

    The names are the stripped texts of every ``<lbj-link>`` nested inside
    the first ``<lbj-title variant="paragraph">`` element, joined with
    ``', '``. If that element contains no links the result is an empty
    string. ``np.nan`` is returned when the page has no such title element.
    """
    paragraphs = soup.find_all('lbj-title', attrs={'variant': 'paragraph'})
    if not paragraphs:
        return np.nan
    # Only the first paragraph-style title is inspected for links.
    links = paragraphs[0].find_all('lbj-link')
    return ', '.join(link.text.strip() for link in links)
def find_date(soup):
    """Return the publication date text shown on the page.

    Looks inside the first ``<lbj-title variant="date">`` element for a
    ``<time>`` element and returns its whitespace-stripped text.

    Returns ``np.nan`` when the page has no date title, or when the date
    title contains no ``<time>`` element.
    """
    date_titles = soup.find_all('lbj-title', attrs={'variant': 'date'})
    if not date_titles:
        return np.nan
    time_tags = date_titles[0].find_all('time')
    if not time_tags:
        # Indexing [0] here used to raise IndexError and abort the caller;
        # a date title without <time> is reported as missing instead.
        return np.nan
    return time_tags[0].text.strip()
def find_report_link(soup):
    """Return the ``href`` of the first ``<lbj-button>`` element on the page.

    The value is whatever ``.get('href')`` yields for that element, so a
    button without an ``href`` attribute produces that lookup's default
    rather than ``np.nan``. ``np.nan`` is returned only when the page has no
    ``<lbj-button>`` element at all.
    """
    buttons = soup.find_all('lbj-button')
    if not buttons:
        return np.nan
    first_button = buttons[0]
    return first_button.get('href')
def find_tags(soup):
    """Print the page's tag links, if any.

    Collects every ``<lbj-link variant="tag">`` element in ``soup`` and
    prints the resulting collection when it is non-empty. Nothing is printed
    for a page without tag links.

    NOTE(review): unlike the other ``find_*`` helpers this function returns
    ``None`` on every path; it looks unfinished. Confirm what it should return.
    """
    tag_links = soup.find_all('lbj-link', attrs={'variant': 'tag'})
    if not tag_links:
        return
    print(tag_links)
def find_data_json(soup):
    """Extract publication metadata from the page's ``dataLayer_tags`` script.

    Scans every ``<script type="text/javascript">`` element for an assignment
    of the form ``dataLayer_tags = <payload>;`` and reads the fields stored
    under the payload's ``'urban_page'`` key.

    The payload is decoded as JSON first and, failing that, as a plain Python
    literal. It is never executed: the page content is untrusted, and
    ``eval`` would both run arbitrary code and choke on the JSON literals
    ``null`` / ``true`` / ``false``.

    Returns:
        An 8-tuple ``(publish_date, title, research_area, authors,
        publication_type, eyebrow, tags, info_json)``. ``info_json`` is the
        raw matched payload text; a field absent from ``'urban_page'`` is
        ``None``. Every element is ``np.nan`` when no script holds a payload
        that can be decoded into a mapping with an ``'urban_page'`` mapping.
    """
    # Local imports: these modules are only needed here, and this keeps the
    # function independent of the file's top-level import block.
    import ast
    import json

    pattern = r'dataLayer_tags = (.+);'
    for script in soup.find_all('script', attrs={'type': 'text/javascript'}):
        match = re.search(pattern, str(script))
        if match is None:
            continue
        info_json = match.group(1)

        try:
            payload = json.loads(info_json)
        except ValueError:
            # Not strict JSON (e.g. single-quoted strings): accept literal
            # syntax only, so nothing in the page can be executed.
            try:
                payload = ast.literal_eval(info_json)
            except (ValueError, SyntaxError, TypeError):
                continue

        info = payload.get('urban_page') if isinstance(payload, dict) else None
        if not isinstance(info, dict):
            continue

        return (
            info.get('publish_date'),
            info.get('urban_title'),
            info.get('research_area'),
            info.get('authors'),
            info.get('publication_type'),
            info.get('eyebrow'),
            info.get('tags'),
            info_json,
        )
    return (np.nan,) * 8
# Crawl every page listed in urban_links.csv and record the metadata found on
# it next to the link, writing the result to urban_links_details.csv.
df_links = pd.read_csv('urban_links.csv')
df_links_details = df_links.copy()
try:
    # read_csv yields a default 0..n-1 index here, so the enumerate position
    # is also the row label used with .loc below.
    for i, link in enumerate(df_links['link']):
        URL = "https://www.urban.org/" + link
        print(URL)
        # Bound the wait so a single unresponsive page cannot stall the crawl.
        r = requests.get(URL, timeout=30)
        soup = BeautifulSoup(r.content, 'html5lib')
        report_link = find_report_link(soup)
        publish_date, title, research_area, authors, publication_type, eyebrow, tags, info_json = find_data_json(soup)
        df_links_details.loc[i, 'eyebrow'] = eyebrow
        df_links_details.loc[i, 'title'] = title
        df_links_details.loc[i, 'authors'] = authors
        df_links_details.loc[i, 'date'] = publish_date
        df_links_details.loc[i, 'research_area'] = research_area
        df_links_details.loc[i, 'publication_type'] = publication_type
        df_links_details.loc[i, 'tags'] = tags
        df_links_details.loc[i, 'info_json'] = info_json
        df_links_details.loc[i, 'report_link'] = report_link
        print(publish_date, title, research_area, authors, publication_type, eyebrow, tags, report_link)
        # Periodic checkpoint so a long crawl can be inspected while running.
        if i % 200 == 0:
            df_links_details.to_csv('urban_links_details.csv', index = False)
finally:
    # The checkpoint above only fires every 200 rows, so without this final
    # write the rows scraped after the last checkpoint were never saved.
    # Running it in `finally` also keeps partial progress if the crawl fails.
    df_links_details.to_csv('urban_links_details.csv', index = False)