"""Demo: use a stateful regex predicate as a BeautifulSoup ``find_all`` filter.

The predicate records the ``re.findall`` result of every value it accepts so
the captured groups can be printed next to the elements that matched.
"""

from bs4 import BeautifulSoup, Tag
import re

# Adjacent literals: the resulting string is identical to the single-line
# markup, with exactly one space between consecutive tags.
data = (
    '<div> '
    '<a href="link_1">Link 1</a> '
    '<a href="link_2">Link 1</a> '
    '<a href="link_XXX">Link 1</a> '
    '<a href="link_3">Link 1</a> '
    '</div>'
)

soup = BeautifulSoup(data, 'lxml')


class my_regex_searcher:
    """Callable regex filter that remembers what it matched.

    Calling an instance tests a value against the compiled pattern. Every
    successful test appends the ``findall`` result to ``groups``; iterating
    the instance yields those results in the order they were recorded.
    """

    def __init__(self, regex_string):
        """Compile *regex_string* and start with an empty match history."""
        # One entry (a ``findall`` result list) per accepted value.
        self.groups = []
        self.__r = re.compile(regex_string)

    def __call__(self, what):
        """Return True if *what* matches the pattern, recording the groups.

        *what* may be a ``Tag`` (its ``name`` is tested) or any other value,
        which is tested as-is. Falsy values (``None``, ``''``) never match.
        """
        text = what.name if isinstance(what, Tag) else what
        if not text:
            return False

        found = self.__r.findall(text)
        if not found:
            return False

        self.groups.append(found)
        return True

    def __iter__(self):
        """Yield the recorded ``findall`` results, oldest first."""
        yield from self.groups


# find_all() finishes before zip() starts consuming, so searcher.groups is
# already populated; the i-th result is paired with the i-th recorded match.
# NOTE(review): this pairing assumes bs4 invokes the predicate exactly once
# per candidate value -- confirm against the installed bs4 version.

# Filter on the href attribute value.
searcher = my_regex_searcher(r'link_(\d+)')
for link, groups in zip(soup.find_all(href=searcher), searcher):
    print(link)
    print(groups)

# Filter on the tag itself (the predicate receives the value bs4 passes for
# a name filter and, for a Tag, tests its name).
searcher = my_regex_searcher(r'(d)(i)(v)')
for element, groups in zip(soup.find_all(searcher), searcher):
    print(element.prettify())
    print(groups)
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter