# soupjobs/src/soupjobs/settings.py
#
# 80 lines
# 2.2 KiB
# Python

from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os
from pydantic import BaseSettings, PyObject
import yaml
from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
# Optional shorthands used for settings fields that may be left unset.
OptInt = Optional[int]
OptStr = Optional[str]
OptPyObj = Optional[PyObject]  # pydantic's PyObject type, or None
# Either a single string or a list of strings.
StrListOrStr = Union[str, List[str]]
# String-keyed mapping whose values are a string or a list of strings
# (presumably tag attribute name -> expected value(s); confirm against usage).
TagAttrs = Dict[str, StrListOrStr]
class OutputFormat(str, Enum):
    """Output formats selectable through the ``output_format`` setting.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value (e.g. ``OutputFormat.yaml == 'yaml'``).
    """

    simple = 'simple'
    yaml = 'yaml'
    json = 'json'
class Settings(BaseSettings):
    """Settings model for soupjobs, built on pydantic's ``BaseSettings``.

    Values are gathered from the sources returned by
    ``Config.customise_sources``: init arguments, environment variables
    (prefixed with ``SOUPJOBS + '_'``), file secrets and finally the YAML
    config file read by ``yaml_settings``. According to the pydantic
    documentation, sources listed earlier take priority over later ones.
    """

    # Location of the YAML config file: read from the environment variable
    # named by CONFIG_FILE_ENV_VAR, falling back to CONFIG_FILE_PLACEHOLDER.
    # NOTE(review): os.getenv() yields a str here although the annotation says
    # Optional[Path] -- confirm which one callers expect.
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: StrListOrStr = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: OptInt
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: OptStr
    target_text: OptStr
    target_attrs: TagAttrs = {}
    target_match_func: OptPyObj
    target_required: bool = False
    target_limit: OptInt
    target_extract_text: bool = True
    target_transform: OptPyObj

    # Link to next page:
    next_link_text: OptStr
    next_link_href: OptStr
    next_link_attrs: TagAttrs = {}
    next_link_match_func: OptPyObj
    next_link_limit: OptInt
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        """Return True if at least one of the target filter settings is set.

        The filters checked are ``target_tag``, ``target_text``,
        ``target_attrs`` and ``target_match_func``; a filter counts as set
        when its value is truthy.
        """
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    class Config:
        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            """Return the default settings sources followed by ``yaml_settings``."""
            return init_settings, env_settings, file_secret_settings, yaml_settings
def yaml_settings(settings_obj: 'Settings') -> Dict[str, Any]:
    """Load settings from the YAML config file referenced by ``settings_obj``.

    The file path is taken from the ``_CONFIG_FILE`` attribute of
    ``settings_obj``. The nested ``target`` and ``next_link`` sections are
    flattened into top-level keys of the form ``<section>_<key>``; a key that
    already exists at the top level is kept and not overwritten.

    Args:
        settings_obj: Object exposing the ``_CONFIG_FILE`` attribute.

    Returns:
        The flattened configuration mapping, or an empty dict when the file
        does not exist or contains no YAML document.
    """
    try:
        with open(getattr(settings_obj, '_CONFIG_FILE'), 'r') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    # An empty (or comment-only) file is loaded as None rather than a mapping.
    if config is None:
        return {}
    for section_name in ('target', 'next_link'):
        # A section key without content (e.g. a bare ``target:``) is None too.
        section = config.pop(section_name, None) or {}
        for key, value in section.items():
            config.setdefault(f'{section_name}_{key}', value)
    return config
# Module-level settings instance, created when this module is imported.
settings = Settings()