bugfixes
This commit is contained in:
parent
3f039ae31b
commit
e92924e28a
@ -74,7 +74,8 @@ target:
|
|||||||
extract_text: True
|
extract_text: True
|
||||||
|
|
||||||
# Optional transformation function to apply to every matching target tag.
|
# Optional transformation function to apply to every matching target tag.
|
||||||
# Should take a BS4 Tag object as its sole argument and return a string.
|
# If `extract_text` is `True`, it should take a string as its sole argument and return a string,
|
||||||
|
# otherwise it should take a BS4 Tag object as its sole argument and return a string.
|
||||||
# Example:
|
# Example:
|
||||||
#transform: module.function
|
#transform: module.function
|
||||||
|
|
||||||
|
@ -92,10 +92,10 @@ def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
|
|||||||
# Targets:
|
# Targets:
|
||||||
targets = []
|
targets = []
|
||||||
for tag in soup.find_all(target_filter, limit=settings.target_limit):
|
for tag in soup.find_all(target_filter, limit=settings.target_limit):
|
||||||
target = tag.text if settings.target_extract_text else str(tag)
|
if settings.target_extract_text:
|
||||||
if settings.target_transform:
|
targets.append(settings.target_transform(tag.text) if settings.target_transform else tag.text)
|
||||||
target = settings.target_transform(target)
|
else:
|
||||||
targets.append(target)
|
targets.append(settings.target_transform(tag) if settings.target_transform else str(tag))
|
||||||
# Next links:
|
# Next links:
|
||||||
if settings.next_link_random:
|
if settings.next_link_random:
|
||||||
links = soup.find_all(link_filter)
|
links = soup.find_all(link_filter)
|
||||||
@ -150,10 +150,13 @@ def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = N
|
|||||||
return False
|
return False
|
||||||
for attr_name, attr_value in attrs.items():
|
for attr_name, attr_value in attrs.items():
|
||||||
try:
|
try:
|
||||||
if not string_matches(tag[attr_name], attr_value, regex):
|
values = tag[attr_name]
|
||||||
return False
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return False
|
return False
|
||||||
|
if not isinstance(values, list):
|
||||||
|
values = [values]
|
||||||
|
if not any(string_matches(value, attr_value, regex) for value in values):
|
||||||
|
return False
|
||||||
if func:
|
if func:
|
||||||
return func(tag)
|
return func(tag)
|
||||||
return True
|
return True
|
||||||
|
Loading…
Reference in New Issue
Block a user