# Grab report details from Urban Institute publication pages ("photo embed" snippet).
# Saved Sun Jul 21 2024 12:50:18 GMT+0000 (Coordinated Universal Time) by @sty003 (#python).
def find_type_of_research(soup):
    """Return the 'eyebrow' label text (the research type) from a parsed page.

    Args:
        soup: BeautifulSoup document for one publication page.

    Returns:
        Stripped text of the first <lbj-title variant="eyebrow"> element,
        or np.nan when the element is absent.
    """
    # find() queries the tree once; the original ran find_all() twice.
    node = soup.find('lbj-title', attrs={'variant': 'eyebrow'})
    return node.text.strip() if node is not None else np.nan

def find_title(soup):
    """Return the page title, or np.nan when no title element exists.

    Tries the newer web-component markup first, then falls back to the
    legacy <h1 class="title heading-1"> markup.
    """
    # One query per selector; the original ran each find_all() twice.
    for tag, attrs in (
        ('lbj-title', {'variant': 'heading-1'}),
        ('h1', {'class': 'title heading-1'}),
    ):
        node = soup.find(tag, attrs=attrs)
        if node is not None:
            return node.text.strip()
    return np.nan

def find_subtitle(soup):
    """Return the subtitle text, or np.nan when absent.

    The original silently returned None on a miss, inconsistent with every
    sibling finder in this file; np.nan restores the common sentinel.
    """
    node = soup.find('lbj-title', attrs={'variant': 'subtitle'})
    return node.text.strip() if node is not None else np.nan

def find_authors(soup):
    """Return a comma-joined string of author names, or np.nan when absent.

    Authors are the <lbj-link> children of the first
    <lbj-title variant="paragraph"> block on the page.
    """
    # Single query; the original ran the same find_all() twice.
    block = soup.find('lbj-title', attrs={'variant': 'paragraph'})
    if block is None:
        return np.nan
    names = [link.text.strip() for link in block.find_all('lbj-link')]
    return ', '.join(names)

def find_date(soup):
    """Return the publication date text, or np.nan when absent.

    Fixes two issues in the original: the duplicated find_all() query, and
    an IndexError when the date block exists but contains no <time> child
    (the original did find_all('time')[0] unguarded).
    """
    block = soup.find('lbj-title', attrs={'variant': 'date'})
    if block is not None:
        time_tag = block.find('time')
        if time_tag is not None:
            return time_tag.text.strip()
    return np.nan

def find_report_link(soup):
    """Return the href of the first <lbj-button> (download link), or np.nan.

    Note: .get('href') may itself return None when the button carries no
    href attribute; callers storing this into pandas treat both as missing.
    """
    # Single query; the original ran find_all() twice.
    button = soup.find('lbj-button')
    return button.get('href') if button is not None else np.nan

def find_tags(soup):
    """Return comma-joined tag texts, or np.nan when the page has no tags.

    The original only print()-ed the raw tag elements (leftover debugging)
    and implicitly returned None; this returns a value consistent with the
    sibling finders (cf. find_authors).
    """
    tag_nodes = soup.find_all('lbj-link', attrs={'variant': 'tag'})
    if tag_nodes:
        return ', '.join(node.text.strip() for node in tag_nodes)
    return np.nan

def find_data_json(soup):
    """Extract the dataLayer_tags metadata embedded in the page's JavaScript.

    Scans every <script type="text/javascript"> for a
    ``dataLayer_tags = {...};`` assignment and parses the right-hand side.

    Returns:
        Tuple of (publish_date, title, research_area, authors,
        publication_type, eyebrow, tags, info_json). All eight entries are
        np.nan when no matching script is found.
    """
    import ast

    scripts = soup.find_all('script', attrs={'type': 'text/javascript'})
    # Compile once and search each script once (original searched twice).
    pattern = re.compile(r'dataLayer_tags = (.+);')
    for script in scripts:
        match = pattern.search(str(script))
        if not match:
            continue
        info_json = match.group(1)
        # SECURITY: the original used eval() on scraped page content, which
        # executes arbitrary code. ast.literal_eval parses only Python
        # literals and cannot run code.
        info = ast.literal_eval(info_json)['urban_page']
        return (
            info.get('publish_date'),
            info.get('urban_title'),
            info.get('research_area'),
            info.get('authors'),
            info.get('publication_type'),
            info.get('eyebrow'),
            info.get('tags'),
            info_json,
        )
    return (np.nan,) * 8

# Enrich each scraped link with per-page details, checkpointing to CSV.
df_links = pd.read_csv('urban_links.csv')
df_links_details = df_links.copy()

for i in range(len(df_links)):
    URL = "https://www.urban.org/" + df_links['link'][i]
    print(URL)
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    report_link = find_report_link(soup)
    publish_date, title, research_area, authors, publication_type, eyebrow, tags, info_json = find_data_json(soup)

    df_links_details.loc[i, 'eyebrow'] = eyebrow
    df_links_details.loc[i, 'title'] = title
    df_links_details.loc[i, 'authors'] = authors
    df_links_details.loc[i, 'date'] = publish_date
    df_links_details.loc[i, 'research_area'] = research_area
    df_links_details.loc[i, 'publication_type'] = publication_type
    df_links_details.loc[i, 'tags'] = tags
    df_links_details.loc[i, 'info_json'] = info_json
    df_links_details.loc[i, 'report_link'] = report_link

    print(publish_date, title, research_area, authors, publication_type, eyebrow, tags, report_link)

    # Periodic checkpoint so a crash mid-run loses at most 200 rows.
    if i % 200 == 0:
        df_links_details.to_csv('urban_links_details.csv', index=False)

# BUG FIX: the original only saved inside the i % 200 checkpoint, so every
# row after the last multiple of 200 was silently lost. Write the final,
# complete file once the loop finishes.
df_links_details.to_csv('urban_links_details.csv', index=False)
# content_copyCOPY  (artifact of the snippet-sharing site; not part of the script)