0 points

python - When using BeautifulSoup 4's `find_all` with a regex, how do I access regex match capture groups? - Stack Overflow


PHOTO EMBED

Fri Dec 25 2020 11:54:13 GMT+0000 (UTC)

Saved by @AboSari #python

from bs4 import BeautifulSoup, Tag
import re

# Sample markup: three anchors whose href ends in digits and one
# ("link_XXX") that does not, so a digit-based pattern skips it.
data = (
    '<div>\n'
    '<a href="link_1">Link 1</a>\n'
    '<a href="link_2">Link 1</a>\n'
    '<a href="link_XXX">Link 1</a>\n'
    '<a href="link_3">Link 1</a>\n'
    '</div>'
)

# Build the parse tree for the sample markup, explicitly selecting the
# 'lxml' parser via the ``features`` argument.
soup = BeautifulSoup(data, features='lxml')

class my_regex_searcher:
    """Callable filter that remembers the regex captures of what it accepts.

    An instance is meant to be handed to ``find_all`` as a filter. Each
    time it is called with a value the pattern matches, the ``findall``
    result for that value is appended to ``groups``. Iterating over the
    instance yields those stored results in the order they were recorded.
    """

    def __init__(self, regex_string):
        """Compile *regex_string* and start with an empty capture list."""
        self._pattern = re.compile(regex_string)
        # One entry per accepted value; each entry is a ``findall`` result.
        self.groups = []

    def __call__(self, what):
        """Return True if *what* matches the pattern, recording its captures.

        A ``Tag`` is tested by its name; any other value is tested as
        given. Falsy values (for example ``None`` or an empty string)
        are rejected without consulting the pattern.
        """
        candidate = what.name if isinstance(what, Tag) else what
        if not candidate:
            return False

        captured = self._pattern.findall(candidate)
        if not captured:
            return False

        self.groups.append(captured)
        return True

    def __iter__(self):
        """Yield each recorded ``findall`` result, oldest first."""
        for entry in self.groups:
            yield entry

# Attribute filter: keep tags whose href matches, then show each tag next
# to the captures recorded for it. A fresh searcher is created per search
# so the recorded captures line up with that search's results.
searcher = my_regex_searcher(r'link_(\d+)')
matched_links = soup.find_all(href=searcher)
for link, captured in zip(matched_links, searcher):
    print(link, captured, sep='\n')

# Tag filter: the searcher is passed as the filter itself, so it is
# matched against tag names rather than an attribute value.
searcher = my_regex_searcher(r'(d)(i)(v)')
matched_tags = soup.find_all(searcher)
for tag, captured in zip(matched_tags, searcher):
    print(tag.prettify(), captured, sep='\n')
content_copyCOPY

https://stackoverflow.com/questions/33192381/when-using-beautifulsoup-4s-find-all-with-a-regex-how-do-i-access-regex-matc