soupjobs/src/soupjobs/settings.py

76 lines
2.1 KiB
Python
Raw Normal View History

2021-03-13 16:53:15 +01:00
from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os
from pydantic import BaseSettings, PyObject
import yaml
2021-07-25 14:45:07 +02:00
from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
2021-03-13 16:53:15 +01:00
# Alias for a str-to-str mapping; used below as the type of the
# `target_attrs` and `next_link_attrs` settings.
TagAttrs = Dict[str, str]
class OutputFormat(str, Enum):
    """
    Allowed values for the `output_format` setting.

    The `str` mix-in makes every member compare equal to its plain string
    value, so e.g. `OutputFormat.json == 'json'` holds.
    """
    simple = 'simple'
    yaml = 'yaml'
    json = 'json'
class Settings(BaseSettings):
    """
    Configuration model for the package, populated by pydantic from several sources.

    Values are collected from (in the order returned by `Config.customise_sources`)
    constructor arguments, environment variables prefixed with `SOUPJOBS + '_'`,
    secret files, and finally the YAML file handled by `yaml_settings`.
    """

    # Location of the YAML config file, resolved once when the class body is executed:
    # the environment variable named by `CONFIG_FILE_ENV_VAR`, else `CONFIG_FILE_PLACEHOLDER`.
    # NOTE(review): annotated as `Optional[Path]`, but `os.getenv` hands back a plain string
    # when the variable is set -- confirm whether a `Path` conversion is intended.
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    # Optional fields carry an explicit `None` default; pydantic v1 would imply the same default,
    # spelling it out keeps the fields visibly non-required.
    max_pages: Optional[int] = None
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: Optional[str] = None
    target_text: Optional[str] = None
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject] = None
    target_required: bool = False
    target_limit: Optional[int] = None
    target_extract_text: bool = True
    target_transform: Optional[PyObject] = None

    # Link to next page:
    next_link_text: Optional[str] = None
    next_link_href: Optional[str] = None
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject] = None
    next_link_limit: Optional[int] = None
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        """
        Return `True` if at least one of `target_tag`, `target_text`, `target_attrs`
        or `target_match_func` holds a truthy value, `False` otherwise.
        """
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    class Config:
        # Re-run validation whenever an attribute is assigned on an existing instance.
        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            """
            Append `yaml_settings` to pydantic's three default settings sources.

            pydantic gives earlier entries of the returned tuple priority, so a value from
            the YAML file only applies where none of the preceding sources provides one.
            """
            # `yaml_settings` is defined further down in this module; the name is only
            # looked up when pydantic calls this hook, i.e. when `Settings` is instantiated.
            return init_settings, env_settings, file_secret_settings, yaml_settings
def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
    """
    Settings source that reads values from the YAML config file.

    The file path is taken from the `_CONFIG_FILE` attribute of `settings_obj`.
    The nested `target` and `next_link` sections are flattened into top-level keys
    prefixed with the section name (`target: {tag: x}` becomes `target_tag: x`);
    a flat key that is already present wins over the nested one.

    Args:
        settings_obj: The settings instance (or class) carrying `_CONFIG_FILE`.

    Returns:
        The flattened config dictionary, or an empty dictionary if the file
        does not exist or contains no YAML data.
    """
    try:
        with open(getattr(settings_obj, '_CONFIG_FILE'), 'r') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    # An empty or comment-only file makes `safe_load` return `None`; treat it like a missing file
    # instead of failing on `None.pop` below.
    if config is None:
        return {}
    for section_name in ('target', 'next_link'):
        # `or {}` also covers a section key written without a value (`target:`), which loads as `None`.
        section = config.pop(section_name, None) or {}
        for key, value in section.items():
            # `setdefault` leaves an explicitly given flat key (e.g. `target_tag`) untouched.
            config.setdefault(f'{section_name}_{key}', value)
    return config
2021-03-13 16:53:15 +01:00
# Module-level settings instance, created when this module is imported.
settings = Settings()