From 6a4efbd16277e6e8572be609c4dac3b9aac164b8 Mon Sep 17 00:00:00 2001 From: Daniil Fajnberg Date: Sun, 25 Jul 2021 17:33:06 +0200 Subject: [PATCH] enabled multiple values for one tag attribute --- src/soupjobs/default.config.yaml | 4 +++- src/soupjobs/scrape.py | 9 ++++++--- src/soupjobs/settings.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/soupjobs/default.config.yaml b/src/soupjobs/default.config.yaml index c8d8064..c3ea5b3 100644 --- a/src/soupjobs/default.config.yaml +++ b/src/soupjobs/default.config.yaml @@ -53,7 +53,9 @@ target: # Example: #attrs: #id: firstHeading - #class: foo + #class: + # - foo + # - example #role: bar # Filter using a custom python function with the path specified in dot-notation. diff --git a/src/soupjobs/scrape.py b/src/soupjobs/scrape.py index a0c11b5..9f58271 100644 --- a/src/soupjobs/scrape.py +++ b/src/soupjobs/scrape.py @@ -148,15 +148,18 @@ def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = N return False if not string_matches(tag.text, text, regex): return False - for attr_name, attr_value in attrs.items(): + for attr_name, attr_values in attrs.items(): try: values = tag[attr_name] except KeyError: return False if not isinstance(values, list): values = [values] - if not any(string_matches(value, attr_value, regex) for value in values): - return False + if not isinstance(attr_values, list): + attr_values = [attr_values] + for attr_value in attr_values: + if not any(string_matches(value, attr_value, regex) for value in values): + return False if func: return func(tag) return True diff --git a/src/soupjobs/settings.py b/src/soupjobs/settings.py index ae943aa..d3d673f 100644 --- a/src/soupjobs/settings.py +++ b/src/soupjobs/settings.py @@ -9,8 +9,8 @@ import yaml from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER -TagAttrs = Dict[str, str] StrListOrStr = Union[str, List[str]] +TagAttrs = Dict[str, StrListOrStr] OptInt = Optional[int] OptStr = Optional[str] OptPyObj = Optional[PyObject]