80 lines
2.2 KiB
Python
80 lines
2.2 KiB
Python
from typing import Dict, List, Union, Optional, Any
|
|
from pathlib import Path
|
|
from enum import Enum
|
|
import os
|
|
|
|
from pydantic import BaseSettings, PyObject
|
|
import yaml
|
|
|
|
from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
|
|
|
|
|
|
# Type aliases shared by the settings fields.

# Either a single string or a list of strings.
StrListOrStr = Union[str, List[str]]
# Mapping of a name to one string or a list of strings.
TagAttrs = Dict[str, StrListOrStr]

# Optional variants: `None` is an accepted value for each of these.
OptInt = Optional[int]
OptStr = Optional[str]
OptPyObj = Optional[PyObject]
|
|
|
|
|
|
class OutputFormat(str, Enum):
    """Closed set of output format names; every member is also a plain `str`."""

    simple = 'simple'
    yaml = 'yaml'
    json = 'json'
|
|
|
|
|
|
class Settings(BaseSettings):
    """
    Application settings assembled by pydantic from several sources.

    Values are taken from constructor arguments, environment variables (looked up with the
    prefix `SOUPJOBS + '_'`), secret files and the YAML config file; the order is defined in
    `Config.customise_sources`.
    """

    # Location of the YAML config file: read from the environment variable named by
    # `CONFIG_FILE_ENV_VAR`, falling back to `CONFIG_FILE_PLACEHOLDER` when it is unset.
    # NOTE(review): `os.getenv` returns a `str` when the variable is set, while the annotation
    # says `Optional[Path]` -- confirm which type callers rely on before changing either.
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    # General options:
    entry_urls: StrListOrStr = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: OptInt
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: OptStr
    target_text: OptStr
    target_attrs: TagAttrs = {}
    target_match_func: OptPyObj
    target_required: bool = False
    target_limit: OptInt
    target_extract_text: bool = True
    target_transform: OptPyObj

    # Link to next page:
    next_link_text: OptStr
    next_link_href: OptStr
    next_link_attrs: TagAttrs = {}
    next_link_match_func: OptPyObj
    next_link_limit: OptInt
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        """Return `True` if any of the target tag, text, attrs or match function is set (truthy)."""
        return bool(
            self.target_tag
            or self.target_text
            or self.target_attrs
            or self.target_match_func
        )

    class Config:
        # Validate values again whenever an attribute is assigned on an existing instance.
        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            """Return pydantic's three default sources followed by the YAML file source."""
            sources = (init_settings, env_settings, file_secret_settings, yaml_settings)
            return sources
|
|
|
|
|
|
def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
    """
    Settings source that loads values from the YAML config file.

    Opens the file named by the `_CONFIG_FILE` attribute of `settings_obj` and flattens the
    nested `target` and `next_link` sections into top-level keys prefixed with the section
    name, e.g. `target: {tag: a}` becomes `target_tag: a`. A key that is already present at
    the top level is kept and not overwritten by its nested counterpart.

    Args:
        settings_obj: Object carrying the `_CONFIG_FILE` attribute.

    Returns:
        Flat dictionary of the loaded settings; empty if the file does not exist or
        contains no YAML document.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's locale encoding.
        with open(getattr(settings_obj, '_CONFIG_FILE'), 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    # An empty or comment-only file makes `safe_load` return `None` rather than a mapping.
    if config is None:
        return {}
    for section_name in ('target', 'next_link'):
        section = config.pop(section_name, None)
        # A section key without a value (a bare `target:`) is loaded as `None`; nothing to flatten.
        if not section:
            continue
        for key, value in section.items():
            config.setdefault(f'{section_name}_{key}', value)
    return config
|
|
|
|
|
|
# Shared module-level settings instance; it is created when this module is imported.
settings = Settings()
|