# soupjobs/src/soupjobs/settings.py
#
# 75 lines
# 2.1 KiB
# Python

from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os
from pydantic import BaseSettings, PyObject
import yaml
from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
# Alias for a str-to-str mapping; used below as the type of the
# `target_attrs` and `next_link_attrs` settings.
TagAttrs = Dict[str, str]
class OutputFormat(str, Enum):
    """Allowed values for the `output_format` setting.

    Members subclass `str`, so each one compares equal to its plain string value.
    """

    simple = "simple"
    yaml = "yaml"
    json = "json"
class Settings(BaseSettings):
    """Scraper settings, populated from several sources.

    Values are collected from the sources returned by
    `Config.customise_sources`: init arguments, environment variables,
    secrets files and finally the YAML config file read by `yaml_settings`.

    NOTE(review): the `Optional[...]` fields declared without a default rely on
    pydantic (v1) treating them as not required with a default of `None` —
    confirm against the pinned pydantic version.
    """

    # Location of the YAML config file, resolved once at class-definition time
    # from the environment variable named by CONFIG_FILE_ENV_VAR, falling back
    # to CONFIG_FILE_PLACEHOLDER.
    # NOTE(review): annotated as Optional[Path], but `os.getenv` returns a str
    # when the variable is set; the value is only ever passed to `open()`.
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: Optional[int]
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: Optional[str]
    target_text: Optional[str]
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject]
    target_required: bool = False
    target_limit: Optional[int]
    target_extract_text: bool = True
    target_transform: Optional[PyObject]

    # Link to next page:
    next_link_text: Optional[str]
    next_link_href: Optional[str]
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject]
    next_link_limit: Optional[int]
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        """Return True if at least one of the target filter settings is truthy.

        The settings checked are `target_tag`, `target_text`, `target_attrs`
        and `target_match_func`.
        """
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    @staticmethod
    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
        """Load settings from the YAML file at `Settings._CONFIG_FILE`.

        The nested sections `target` and `next_link` are flattened into
        prefixed top-level keys, e.g. `target: {tag: a}` becomes
        `target_tag: a`. A prefixed key that already exists at the top level
        is kept and the nested value is ignored.

        Args:
            _settings_obj: The settings instance handed over by pydantic; unused.

        Returns:
            The flattened mapping, or an empty dict if the file does not exist
            or contains no document.

        Raises:
            TypeError: If the top level of the YAML document is not a mapping.
        """
        try:
            # Explicit encoding keeps parsing independent of the platform default.
            with open(Settings._CONFIG_FILE, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            return {}
        # `safe_load` yields None for an empty file; treat it like a missing one.
        if config is None:
            return {}
        if not isinstance(config, dict):
            raise TypeError(
                f"Top level of config file must be a mapping, not {type(config).__name__}"
            )
        for section_name in ('target', 'next_link'):
            section = config.pop(section_name, None)
            # A section that is absent or left empty (null) contributes nothing.
            if section is None:
                continue
            for key, value in section.items():
                config.setdefault(f'{section_name}_{key}', value)
        return config

    class Config:
        """Pydantic configuration for `Settings`."""

        validate_assignment = True
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            """Return the settings sources, appending the YAML file as the last one.

            NOTE(review): pydantic v1 gives earlier sources priority, which
            would make the YAML file the lowest-priority source — confirm
            against the pinned pydantic version.
            """
            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings
# Module-level settings instance, created when this module is imported.
# No init arguments are passed, so every value comes from the other sources
# listed in `Settings.Config.customise_sources` or from the field defaults.
settings = Settings()