# Grab report details from Urban Institute publication pages ("photo embed" snippet).
# Saved Sun Jul 21 2024 12:50:18 GMT+0000 (Coordinated Universal Time) by @sty003 (#python).
def find_type_of_research(soup):
    """Return the 'eyebrow' label text (the research type) from a parsed page.

    Args:
        soup: BeautifulSoup document for one publication page.

    Returns:
        Stripped text of the first <lbj-title variant="eyebrow"> element,
        or np.nan when the element is absent.
    """
    # find() queries the tree once; the original ran find_all() twice.
    node = soup.find('lbj-title', attrs={'variant': 'eyebrow'})
    return node.text.strip() if node is not None else np.nan

def find_title(soup):
    """Return the page title, or np.nan when no title element exists.

    Tries the newer web-component markup first, then falls back to the
    legacy <h1 class="title heading-1"> markup.
    """
    # One query per selector; the original ran each find_all() twice.
    for tag, attrs in (
        ('lbj-title', {'variant': 'heading-1'}),
        ('h1', {'class': 'title heading-1'}),
    ):
        node = soup.find(tag, attrs=attrs)
        if node is not None:
            return node.text.strip()
    return np.nan

def find_subtitle(soup):
    """Return the subtitle text, or np.nan when absent.

    The original silently returned None on a miss, inconsistent with every
    sibling finder in this file; np.nan restores the common sentinel.
    """
    node = soup.find('lbj-title', attrs={'variant': 'subtitle'})
    return node.text.strip() if node is not None else np.nan

def find_authors(soup):
    """Return a comma-joined string of author names, or np.nan when absent.

    Authors are the <lbj-link> children of the first
    <lbj-title variant="paragraph"> block on the page.
    """
    # Single query; the original ran the same find_all() twice.
    block = soup.find('lbj-title', attrs={'variant': 'paragraph'})
    if block is None:
        return np.nan
    names = [link.text.strip() for link in block.find_all('lbj-link')]
    return ', '.join(names)

def find_date(soup):
    """Return the publication date text, or np.nan when absent.

    Fixes two issues in the original: the duplicated find_all() query, and
    an IndexError when the date block exists but contains no <time> child
    (the original did find_all('time')[0] unguarded).
    """
    block = soup.find('lbj-title', attrs={'variant': 'date'})
    if block is not None:
        time_tag = block.find('time')
        if time_tag is not None:
            return time_tag.text.strip()
    return np.nan

def find_report_link(soup):
    """Return the href of the first <lbj-button> (download link), or np.nan.

    Note: .get('href') may itself return None when the button carries no
    href attribute; callers storing this into pandas treat both as missing.
    """
    # Single query; the original ran find_all() twice.
    button = soup.find('lbj-button')
    return button.get('href') if button is not None else np.nan

def find_tags(soup):
    """Return comma-joined tag texts, or np.nan when the page has no tags.

    The original only print()-ed the raw tag elements (leftover debugging)
    and implicitly returned None; this returns a value consistent with the
    sibling finders (cf. find_authors).
    """
    tag_nodes = soup.find_all('lbj-link', attrs={'variant': 'tag'})
    if tag_nodes:
        return ', '.join(node.text.strip() for node in tag_nodes)
    return np.nan

def find_data_json(soup):
    """Extract the dataLayer_tags metadata embedded in the page's JavaScript.

    Scans every <script type="text/javascript"> for a
    ``dataLayer_tags = {...};`` assignment and parses the right-hand side.

    Returns:
        Tuple of (publish_date, title, research_area, authors,
        publication_type, eyebrow, tags, info_json). All eight entries are
        np.nan when no matching script is found.
    """
    import ast

    scripts = soup.find_all('script', attrs={'type': 'text/javascript'})
    # Compile once and search each script once (original searched twice).
    pattern = re.compile(r'dataLayer_tags = (.+);')
    for script in scripts:
        match = pattern.search(str(script))
        if not match:
            continue
        info_json = match.group(1)
        # SECURITY: the original used eval() on scraped page content, which
        # executes arbitrary code. ast.literal_eval parses only Python
        # literals and cannot run code.
        info = ast.literal_eval(info_json)['urban_page']
        return (
            info.get('publish_date'),
            info.get('urban_title'),
            info.get('research_area'),
            info.get('authors'),
            info.get('publication_type'),
            info.get('eyebrow'),
            info.get('tags'),
            info_json,
        )
    return (np.nan,) * 8

# Enrich each scraped link with per-page details, checkpointing to CSV.
df_links = pd.read_csv('urban_links.csv')
df_links_details = df_links.copy()

for i in range(len(df_links)):
    URL = "https://www.urban.org/" + df_links['link'][i]
    print(URL)
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    report_link = find_report_link(soup)
    publish_date, title, research_area, authors, publication_type, eyebrow, tags, info_json = find_data_json(soup)

    df_links_details.loc[i, 'eyebrow'] = eyebrow
    df_links_details.loc[i, 'title'] = title
    df_links_details.loc[i, 'authors'] = authors
    df_links_details.loc[i, 'date'] = publish_date
    df_links_details.loc[i, 'research_area'] = research_area
    df_links_details.loc[i, 'publication_type'] = publication_type
    df_links_details.loc[i, 'tags'] = tags
    df_links_details.loc[i, 'info_json'] = info_json
    df_links_details.loc[i, 'report_link'] = report_link

    print(publish_date, title, research_area, authors, publication_type, eyebrow, tags, report_link)

    # Periodic checkpoint so a crash mid-run loses at most 200 rows.
    if i % 200 == 0:
        df_links_details.to_csv('urban_links_details.csv', index=False)

# BUG FIX: the original only saved inside the i % 200 checkpoint, so every
# row after the last multiple of 200 was silently lost. Write the final,
# complete file once the loop finishes.
df_links_details.to_csv('urban_links_details.csv', index=False)
# content_copyCOPY  (artifact of the snippet-sharing site; not part of the script)