from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os

from pydantic import BaseSettings, PyObject
import yaml

from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER


# Type aliases shared by the settings fields below.
StrListOrStr = Union[str, List[str]]
TagAttrs = Dict[str, StrListOrStr]
OptInt = Optional[int]
OptStr = Optional[str]
OptPyObj = Optional[PyObject]


class OutputFormat(str, Enum):
    """Closed set of values accepted for `Settings.output_format`."""

    simple = 'simple'
    yaml = 'yaml'
    json = 'json'


class Settings(BaseSettings):
    """
    Application settings assembled from init arguments, environment
    variables, secret files and a YAML config file (see `Config`).
    """

    # Path of the YAML config file, read once from the environment at class
    # creation time; falls back to the package's placeholder value.
    # NOTE(review): annotated as `Path` but `os.getenv` yields a `str` here.
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    # General options:
    entry_urls: StrListOrStr = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: OptInt
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: OptStr
    target_text: OptStr
    target_attrs: TagAttrs = {}
    target_match_func: OptPyObj
    target_required: bool = False
    target_limit: OptInt
    target_extract_text: bool = True
    target_transform: OptPyObj

    # Link to next page:
    next_link_text: OptStr
    next_link_href: OptStr
    next_link_attrs: TagAttrs = {}
    next_link_match_func: OptPyObj
    next_link_limit: OptInt
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        """Return True if at least one of the target filter settings is truthy."""
        return any((
            self.target_tag,
            self.target_text,
            self.target_attrs,
            self.target_match_func,
        ))

    class Config:
        """Pydantic configuration for `Settings`."""

        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            """
            Append the YAML file source after the three default sources.

            Pydantic gives earlier sources priority, so values from the YAML
            file only apply where no other source provided one.
            """
            return init_settings, env_settings, file_secret_settings, yaml_settings


def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
    """
    Settings source that loads values from the YAML config file.

    The nested `target` and `next_link` sections are flattened into
    `target_<key>` / `next_link_<key>` entries. A flat key that is already
    present at the top level is kept in preference to the nested one.

    Args:
        settings_obj: The settings object whose `_CONFIG_FILE` attribute
            names the file to read.

    Returns:
        The flattened configuration mapping; an empty dictionary if the file
        does not exist or contains no YAML document.
    """
    try:
        with open(getattr(settings_obj, '_CONFIG_FILE'), 'r') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    # An empty (or comment-only) file loads as `None`, not as a mapping.
    if config is None:
        return {}
    for section_name in ('target', 'next_link'):
        section = config.pop(section_name, None)
        # A section header without any content (e.g. `target:`) loads as `None`.
        if section is None:
            continue
        for key, value in section.items():
            config.setdefault(f'{section_name}_{key}', value)
    return config


settings = Settings()