bugfixes
This commit is contained in:
parent
3f039ae31b
commit
e92924e28a
@ -74,7 +74,8 @@ target:
|
||||
extract_text: True
|
||||
|
||||
# Optional transformation function to apply to every matching target tag.
|
||||
# Should take a BS4 Tag object as its sole argument and return a string.
|
||||
# If `extract_text` is `True`, it should take a string as its sole argument and return a string,
|
||||
# otherwise it should take a BS4 Tag object as its sole argument and return a string.
|
||||
# Example:
|
||||
#transform: module.function
|
||||
|
||||
|
@ -92,10 +92,10 @@ def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
|
||||
# Targets:
|
||||
targets = []
|
||||
for tag in soup.find_all(target_filter, limit=settings.target_limit):
|
||||
target = tag.text if settings.target_extract_text else str(tag)
|
||||
if settings.target_transform:
|
||||
target = settings.target_transform(target)
|
||||
targets.append(target)
|
||||
if settings.target_extract_text:
|
||||
targets.append(settings.target_transform(tag.text) if settings.target_transform else tag.text)
|
||||
else:
|
||||
targets.append(settings.target_transform(tag) if settings.target_transform else str(tag))
|
||||
# Next links:
|
||||
if settings.next_link_random:
|
||||
links = soup.find_all(link_filter)
|
||||
@ -150,10 +150,13 @@ def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = N
|
||||
return False
|
||||
for attr_name, attr_value in attrs.items():
|
||||
try:
|
||||
if not string_matches(tag[attr_name], attr_value, regex):
|
||||
return False
|
||||
values = tag[attr_name]
|
||||
except KeyError:
|
||||
return False
|
||||
if not isinstance(values, list):
|
||||
values = [values]
|
||||
if not any(string_matches(value, attr_value, regex) for value in values):
|
||||
return False
|
||||
if func:
|
||||
return func(tag)
|
||||
return True
|
||||
|
Loading…
Reference in New Issue
Block a user