diff --git a/src/soupjobs/default.config.yaml b/src/soupjobs/default.config.yaml
index 1ebc033..c8d8064 100644
--- a/src/soupjobs/default.config.yaml
+++ b/src/soupjobs/default.config.yaml
@@ -74,7 +74,8 @@ target:
   extract_text: True
 
   # Optional transformation function to apply to every matching target tag.
-  # Should take a BS4 Tag object as its sole argument and return a string.
+  # If `extract_text` is `True`, it should take a string as its sole argument and return a string,
+  # otherwise it should take a BS4 Tag object as its sole argument and return a string.
   # Example:
   #transform: module.function
 
diff --git a/src/soupjobs/scrape.py b/src/soupjobs/scrape.py
index b6c1177..a0c11b5 100644
--- a/src/soupjobs/scrape.py
+++ b/src/soupjobs/scrape.py
@@ -92,10 +92,10 @@ def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
     # Targets:
     targets = []
     for tag in soup.find_all(target_filter, limit=settings.target_limit):
-        target = tag.text if settings.target_extract_text else str(tag)
-        if settings.target_transform:
-            target = settings.target_transform(target)
-        targets.append(target)
+        if settings.target_extract_text:
+            targets.append(settings.target_transform(tag.text) if settings.target_transform else tag.text)
+        else:
+            targets.append(settings.target_transform(tag) if settings.target_transform else str(tag))
     # Next links:
     if settings.next_link_random:
         links = soup.find_all(link_filter)
@@ -150,10 +150,13 @@ def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = N
             return False
     for attr_name, attr_value in attrs.items():
         try:
-            if not string_matches(tag[attr_name], attr_value, regex):
-                return False
+            values = tag[attr_name]
         except KeyError:
             return False
+        if not isinstance(values, list):
+            values = [values]
+        if not any(string_matches(value, attr_value, regex) for value in values):
+            return False
     if func:
         return func(tag)
     return True
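
The clarified config comment reflects how the `transform` hook is now called in `scrape_document`: with the tag's text when `extract_text` is `True`, and with the `Tag` object otherwise. A minimal sketch of what such a module could look like, assuming a hypothetical `my_transforms` module (module and function names are illustrative, not part of soupjobs):

```python
# my_transforms.py -- hypothetical module, referenced from the config
# as e.g. `transform: my_transforms.clean_text`.
from bs4 import Tag


def clean_text(text: str) -> str:
    """Pairs with `extract_text: True`: receives the matched tag's text."""
    return " ".join(text.split())


def href_or_markup(tag: Tag) -> str:
    """Pairs with `extract_text: False`: receives the BS4 Tag object itself."""
    return tag.get("href", str(tag))
```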
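
The `tag_filter` change accounts for BeautifulSoup's multi-valued attributes: `tag[attr_name]` returns a list for attributes like `class`, so matching a single string against it would always fail. A quick illustration of the behavior (not part of the patch):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a id="top" class="btn primary" href="/x">go</a>', "html.parser")
tag = soup.a
print(tag["id"])     # 'top'              -> single-valued attribute, plain string
print(tag["class"])  # ['btn', 'primary'] -> multi-valued attribute, list of strings

# The patched filter normalizes both cases to a list and accepts the tag if
# any entry satisfies string_matches(value, attr_value, regex).
```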