from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os

from pydantic import BaseSettings, PyObject
import yaml

from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER


# Attribute name -> value mapping used by the `*_attrs` settings below.
TagAttrs = Dict[str, str]


class OutputFormat(str, Enum):
    """Accepted values for `Settings.output_format`.

    The `str` mixin makes every member compare equal to its plain string value.
    """
    simple = 'simple'
    yaml = 'yaml'
    json = 'json'


class Settings(BaseSettings):
    """Settings object assembled by pydantic from several sources.

    The sources are the ones returned by `Config.customise_sources`: init
    arguments, environment variables (prefixed with `SOUPJOBS + '_'`), secret
    files and finally the YAML config file read by `yaml_settings`. Pydantic
    gives earlier sources priority over later ones.
    """

    # Path of the YAML config file; taken from the environment variable named
    # by `CONFIG_FILE_ENV_VAR`, falling back to `CONFIG_FILE_PLACEHOLDER`.
    CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: Optional[int]
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: Optional[str]
    target_text: Optional[str]
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject]
    target_required: bool = False
    target_limit: Optional[int]
    target_extract_text: bool = True
    target_transform: Optional[PyObject]

    # Link to next page:
    next_link_text: Optional[str]
    next_link_href: Optional[str]
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject]
    next_link_limit: Optional[int]
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        """Return True if at least one of the target filter settings is truthy.

        The settings checked are `target_tag`, `target_text`, `target_attrs`
        and `target_match_func`.
        """
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    class Config:
        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            """Append the YAML config file as the last (lowest priority) source."""
            # `yaml_settings` is defined further down in this module; the name
            # is only looked up when this method is called.
            return init_settings, env_settings, file_secret_settings, yaml_settings


def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
    """Settings source that loads values from the YAML config file.

    The nested `target` and `next_link` sections are flattened into
    `target_<key>` / `next_link_<key>` entries. A flat key that is already
    present at the top level wins over the same key from a nested section.

    Args:
        settings_obj: Object whose `CONFIG_FILE` attribute names the file to read.

    Returns:
        The flattened settings dictionary; an empty dictionary if the file
        does not exist or contains no YAML document.
    """
    # NOTE(review): pydantic calls settings sources while it is still building
    # the instance -- confirm that `CONFIG_FILE` is readable on `settings_obj`
    # at that point.
    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(settings_obj.CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    # `safe_load` yields None for an empty (or comment-only) file.
    if config is None:
        return {}
    for section_name in ('target', 'next_link'):
        # A section that is present but left empty is loaded as None.
        section = config.pop(section_name, None) or {}
        for key, value in section.items():
            config.setdefault(f'{section_name}_{key}', value)
    return config


settings = Settings()