diff --git a/setup.cfg b/setup.cfg
index f6dcff5..89c26d9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,4 +34,4 @@ tests =
 where = src

 [options.package_data]
-soupjobs = example.config.yaml
+soupjobs = default.config.yaml
diff --git a/src/soupjobs/__init__.py b/src/soupjobs/__init__.py
index 9c9ee07..e80b3c8 100644
--- a/src/soupjobs/__init__.py
+++ b/src/soupjobs/__init__.py
@@ -1,2 +1,4 @@
+SOUPJOBS = 'soupjobs'
 CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
 CONFIG_FILE_PLACEHOLDER = 'placeholder'
+CONFIG_FILE_DEFAULT_NAME = 'config.yaml'
diff --git a/src/soupjobs/example.config.yaml b/src/soupjobs/default.config.yaml
similarity index 89%
rename from src/soupjobs/example.config.yaml
rename to src/soupjobs/default.config.yaml
index deb3774..1ebc033 100644
--- a/src/soupjobs/example.config.yaml
+++ b/src/soupjobs/default.config.yaml
@@ -11,21 +11,15 @@
 # if specified as a string that has valid url format, the corresponding page is visited and scraped;
 # otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
 entry_urls:
-  - https://en.wikipedia.org/wiki/Python_(programming_language)
-  - https://en.wikipedia.org/wiki/Guido_van_Rossum

 # If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
 # regular expressions to match the attribute values, otherwise they are simply checked for string equality.
-# Default:
-#regex_mode: False
-regex_mode: True
+regex_mode: False

 # The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
 # For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
 # and then for every one of those, the matching links may be followed again, but then the job would end.
-# Default:
-#max_depth: 0
-max_depth: 2
+max_depth: 0

 # Maximum number of pages to visit.
 # Example:
@@ -33,14 +27,12 @@ max_depth: 2

 # If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
 # otherwise the output is merely a list of all extracted matches.
-# Default:
-#output_with_urls: False
+output_with_urls: False

 # Output can be produced in either 'yaml', 'json' or 'simple' format.
 # The latter will simply print the extracted targets line by line if `output_with_urls` is `False` and
 # add the url of the page they were extracted from in an additional line before them if that setting is `True`.
-# Default:
-#output_format: simple
+output_format: simple


 # Target section
@@ -51,16 +43,16 @@ target:

   # Filter by HTML tag
   # Example to only look for <h1> tags:
-  tag: h1
+  #tag: h1

   # Filter by text inside the tag
   # Example:
   #text: program

   # Filter by any valid HTML attributes
-  attrs:
-    # Examples:
-    id: firstHeading
+  # Example:
+  #attrs:
+    #id: firstHeading
     #class: foo
     #role: bar

@@ -70,18 +62,16 @@
   #match_func: module.function

   # If this is set to `True` and no matching tags are found on a page, an exception is raised.
-  # Default:
-  #required: False
+  required: False

   # Stop doing requests as soon as possible when this number of matches were extracted.
   # Note that setting this parameter will restrict the number of returned targets to no more than is set here,
   # but in asynchronous execution the total number of requests made and targets scraped may be higher.
   # Example:
-  limit: 20
+  #limit: 20

   # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
-  # Default:
-  #extract_text: True
+  extract_text: True

   # Optional transformation function to apply to every matching target tag.
   # Should take a BS4 Tag object as its sole argument and return a string.
@@ -102,7 +92,7 @@ next_link:

   # Filter by the `href` attribute of the anchor tag.
   # Example:
-  href: '^/wiki/\w+'
+  #href: '^/wiki/\w+'

   # Filter by any other valid HTML attributes
   # Example:
@@ -116,9 +106,8 @@ next_link:

   # Get at most this many links to other pages from one page.
   # Example:
-  limit: 10
+  #limit: 10

   # If `True`, and a limit is set that is below the number of matches on one page,
   # the links are chosen at random. Otherwise the first `limit` number are chosen.
-  # Default:
-  #random: True
+  random: True
diff --git a/src/soupjobs/run.py b/src/soupjobs/run.py
index ebc9100..33d27ed 100644
--- a/src/soupjobs/run.py
+++ b/src/soupjobs/run.py
@@ -4,7 +4,7 @@ from distutils.util import strtobool
 import asyncio
 import os

-from soupjobs import CONFIG_FILE_ENV_VAR
+from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME


 if __name__ == '__main__':
@@ -17,8 +17,8 @@
     parser.add_argument(
         '-c', '--config-file',
         type=Path,
-        default=Path(Path(__file__).parent, 'config.yaml'),
-        help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
+        default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
+        help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
     )
     parser.add_argument(
         'entry_urls',
@@ -64,7 +64,7 @@ if __name__ == '__main__':
     if not settings.has_target_filters():
         warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                   "Are you sure you want to proceed? (y/n)"
-        proceed = strtobool(input(warning))
+        proceed = strtobool(input(warning).lower())
         if not proceed:
             print("Cancelled")
             exit()
diff --git a/src/soupjobs/scrape.py b/src/soupjobs/scrape.py
index 4a1d705..b6c1177 100644
--- a/src/soupjobs/scrape.py
+++ b/src/soupjobs/scrape.py
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 import yaml

-from soupjobs.settings import settings, OutputFormat, TagAttrs
+from .settings import settings, OutputFormat, TagAttrs


 BS_PARSER = 'html.parser'
diff --git a/src/soupjobs/settings.py b/src/soupjobs/settings.py
index 8699fe6..296bc5d 100644
--- a/src/soupjobs/settings.py
+++ b/src/soupjobs/settings.py
@@ -6,7 +6,7 @@ import os
 from pydantic import BaseSettings, PyObject
 import yaml

-from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
+from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER


 TagAttrs = Dict[str, str]
@@ -19,7 +19,7 @@ class OutputFormat(str, Enum):


 class Settings(BaseSettings):
-    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
+    CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

     entry_urls: Union[str, List[str]] = []
     regex_mode: bool = False
@@ -49,26 +49,27 @@
     def has_target_filters(self) -> bool:
         return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

-    @staticmethod
-    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
-        try:
-            with open(Settings._CONFIG_FILE, 'r') as f:
-                config = yaml.safe_load(f)
-        except FileNotFoundError:
-            return {}
-        for section_name in ('target', 'next_link'):
-            section = config.pop(section_name, {})
-            for key, value in section.items():
-                config.setdefault(f'{section_name}_{key}', value)
-        return config
-
     class Config:
         validate_assignment = True
+        env_prefix = SOUPJOBS + '_'
         env_file_encoding = 'utf-8'

         @classmethod
         def customise_sources(cls, init_settings, env_settings, file_secret_settings):
-            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings
+            return init_settings, env_settings, file_secret_settings, yaml_settings
+
+
+def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
+    try:
+        with open(settings_obj.CONFIG_FILE, 'r') as f:
+            config = yaml.safe_load(f)
+    except FileNotFoundError:
+        return {}
+    for section_name in ('target', 'next_link'):
+        section = config.pop(section_name, {})
+        for key, value in section.items():
+            config.setdefault(f'{section_name}_{key}', value)
+    return config


 settings = Settings()