This commit is contained in:
Daniil Fajnberg 2021-07-25 16:58:36 +02:00
parent 3f039ae31b
commit e92924e28a
2 changed files with 11 additions and 7 deletions

View File

@@ -74,7 +74,8 @@ target:
extract_text: True
# Optional transformation function to apply to every matching target tag.
# Should take a BS4 Tag object as its sole argument and return a string.
# If `extract_text` is `True`, it should take a string as its sole argument and return a string,
# otherwise it should take a BS4 Tag object as its sole argument and return a string.
# Example:
#transform: module.function

View File

@@ -92,10 +92,10 @@ def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
# Targets:
targets = []
for tag in soup.find_all(target_filter, limit=settings.target_limit):
target = tag.text if settings.target_extract_text else str(tag)
if settings.target_transform:
target = settings.target_transform(target)
targets.append(target)
if settings.target_extract_text:
targets.append(settings.target_transform(tag.text) if settings.target_transform else tag.text)
else:
targets.append(settings.target_transform(tag) if settings.target_transform else str(tag))
# Next links:
if settings.next_link_random:
links = soup.find_all(link_filter)
@@ -150,10 +150,13 @@ def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = N
return False
for attr_name, attr_value in attrs.items():
try:
if not string_matches(tag[attr_name], attr_value, regex):
return False
values = tag[attr_name]
except KeyError:
return False
if not isinstance(values, list):
values = [values]
if not any(string_matches(value, attr_value, regex) for value in values):
return False
if func:
return func(tag)
return True