2021-03-13 16:53:15 +01:00
|
|
|
from typing import Dict, List, Union, Optional, Any
|
|
|
|
from pathlib import Path
|
|
|
|
from enum import Enum
|
|
|
|
import os
|
|
|
|
|
|
|
|
from pydantic import BaseSettings, PyObject
|
|
|
|
import yaml
|
|
|
|
|
2021-07-25 14:45:07 +02:00
|
|
|
from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
|
2021-03-13 16:53:15 +01:00
|
|
|
|
|
|
|
|
|
|
|
# String-to-string mapping used as the type of the `target_attrs` and
# `next_link_attrs` settings fields.
# NOTE(review): presumably tag attribute name -> value to match; confirm
# against the code that consumes these settings.
TagAttrs = Dict[str, str]
|
|
|
|
|
|
|
|
|
|
|
|
class OutputFormat(str, Enum):
    """Closed set of values accepted by the `output_format` setting.

    Members also subclass `str`, so each one compares equal to its plain
    string value (e.g. ``OutputFormat.yaml == 'yaml'``).
    """

    simple = 'simple'
    yaml = 'yaml'
    json = 'json'
|
|
|
|
|
|
|
|
|
|
|
|
class Settings(BaseSettings):
    """Settings model for soupjobs, built on pydantic's `BaseSettings`.

    Values are gathered from the sources returned by
    `Config.customise_sources`: constructor arguments, environment variables
    (names prefixed with ``SOUPJOBS + '_'``), secret files, and the YAML
    config file read by `yaml_settings`.
    """

    # Location of the YAML config file: read from the environment variable
    # named by CONFIG_FILE_ENV_VAR, falling back to CONFIG_FILE_PLACEHOLDER.
    # NOTE(review): `os.getenv` yields a str when the variable is set, not a
    # `Path` as annotated -- confirm whether any caller relies on either type.
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    # General options.
    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: Optional[int]
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract (the `target` section of the YAML file maps onto
    # these `target_*` fields, see `yaml_settings`):
    target_tag: Optional[str]
    target_text: Optional[str]
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject]
    target_required: bool = False
    target_limit: Optional[int]
    target_extract_text: bool = True
    target_transform: Optional[PyObject]

    # Link to next page (the `next_link` section of the YAML file maps onto
    # these `next_link_*` fields, see `yaml_settings`):
    next_link_text: Optional[str]
    next_link_href: Optional[str]
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject]
    next_link_limit: Optional[int]
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        """Return True if at least one of the target filter settings is truthy.

        The settings checked are `target_tag`, `target_text`, `target_attrs`
        and `target_match_func`.
        """
        filters = (
            self.target_tag,
            self.target_text,
            self.target_attrs,
            self.target_match_func,
        )
        return any(filters)

    class Config:
        """pydantic configuration for `Settings`."""

        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            """Return the settings sources, with `yaml_settings` appended last.

            The three arguments are the sources handed in by pydantic; they
            are passed through unchanged and in the same order.
            """
            # `yaml_settings` is defined further down in this module; the name
            # is only looked up when this method is actually called.
            sources = (init_settings, env_settings, file_secret_settings, yaml_settings)
            return sources
|
|
|
|
|
|
|
|
|
|
|
|
def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
    """Load settings from the YAML config file as one flat dictionary.

    The file path is taken from the `_CONFIG_FILE` attribute of
    `settings_obj`. The nested `target` and `next_link` sections are
    flattened: every key `k` inside section `s` becomes the top-level key
    ``f'{s}_{k}'``. A top-level key that already exists under that name
    takes precedence over the nested one.

    Args:
        settings_obj: Object exposing the `_CONFIG_FILE` attribute
            (the `Settings` instance being populated).

    Returns:
        The flattened configuration, or an empty dict when the file does
        not exist or contains no YAML document.
    """
    try:
        # UTF-8 is requested explicitly so that decoding does not depend on
        # the platform locale; this matches `env_file_encoding` in
        # `Settings.Config`.
        with open(getattr(settings_obj, '_CONFIG_FILE'), 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    # An empty or comment-only file is parsed as None, not as a mapping.
    if config is None:
        return {}
    for section_name in ('target', 'next_link'):
        # A section written without a body (e.g. `target:`) is loaded as
        # None; treat it the same as a missing section.
        section = config.pop(section_name, None) or {}
        for key, value in section.items():
            config.setdefault(f'{section_name}_{key}', value)
    return config
|
2021-03-13 16:53:15 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Module-level settings instance, created when this module is imported.
settings = Settings()
|