first working draft
commit 65f8c7d4be (parent f58ed9fee0)
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
|||||||
|
config.yaml
|
||||||
# Python virtual environment:
|
# Python virtual environment:
|
||||||
/.venv/
|
/.venv/
|
||||||
# pipenv lock-file and cache directory:
|
# pipenv lock-file and cache directory:
|
||||||
|
Pipfile: 2 additions

@@ -6,6 +6,8 @@ name = "pypi"
 [packages]
 beautifulsoup4 = "*"
 aiohttp = "*"
+pydantic = "*"
+pyyaml = "*"

 [dev-packages]

setup.py: 4 additions, 1 deletion

@@ -14,9 +14,12 @@ setuptools.setup(
     # url="https://github.com/...",
     package_dir={'': 'src'},
     packages=setuptools.find_packages(where='src'),
+    package_data={'soupjobs': ['example.config.yaml']},
     install_requires=[
         'aiohttp',
-        'beautifulsoup4'
+        'beautifulsoup4',
+        'pydantic',
+        'pyyaml',
     ],
     extras_require={
         'tests': ['coverage'],
src/soupjobs/__init__.py (new file, 2 lines)

CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
CONFIG_FILE_PLACEHOLDER = 'placeholder'
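These two constants are how the rest of the package locates its config file. As a minimal sketch of the intended usage (the config path below is only a placeholder), pointing soupjobs at a config programmatically, the same way run.py does, could look like this:

import os

from soupjobs import CONFIG_FILE_ENV_VAR

# soupjobs.settings reads this environment variable when it is first imported,
# so it has to be set before that import happens (run.py does the same thing).
os.environ[CONFIG_FILE_ENV_VAR] = '/path/to/my/config.yaml'

from soupjobs.settings import settings
print(settings.entry_urls)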
src/soupjobs/example.config.yaml (new file, 124 lines)

############################
# Scrape job configuration #
############################

# All possible config parameters are explained here, with either the default or an example value provided for each.


# General section
#################

# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
entry_urls:
  - https://en.wikipedia.org/wiki/Python_(programming_language)
  - https://en.wikipedia.org/wiki/Guido_van_Rossum

# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
# regular expressions to match the attribute values; otherwise they are simply checked for string equality.
# Default:
#regex_mode: False
regex_mode: True

# The maximum recursion depth for following matching links to other pages, starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
# Default:
#max_depth: 0
max_depth: 2

# Maximum number of pages to visit.
# Example:
#max_pages: 100

# If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
# Default:
#output_with_urls: False

# Output can be produced in either 'yaml', 'json' or 'simple' format.
# The latter will simply print the extracted targets line by line if `output_with_urls` is `False` and
# add the url of the page they were extracted from in an additional line before them if that setting is `True`.
# Default:
#output_format: simple


# Target section
################

# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
  tag: h1

  # Filter by text inside the tag
  # Example:
  #text: program

  # Filter by any valid HTML attributes
  attrs:
    # Examples:
    id: firstHeading
    #class: foo
    #role: bar

  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
  # Example:
  #match_func: module.function

  # If this is set to `True` and no matching tags are found on a page, an exception is raised.
  # Default:
  #required: False

  # Stop making requests as soon as possible once this number of matches has been extracted.
  # Note that setting this parameter will restrict the number of returned targets to no more than is set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
  limit: 20

  # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
  # Default:
  #extract_text: True

  # Optional transformation function to apply to every matching target tag.
  # Should take a BS4 Tag object as its sole argument and return a string.
  # Example:
  #transform: module.function


# Links section
###############

# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
next_link:
  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page

  # Filter by the `href` attribute of the anchor tag.
  # Example:
  href: '^/wiki/\w+'

  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener

  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function

  # Get at most this many links to other pages from one page.
  # Example:
  limit: 10

  # If `True`, and a limit is set that is below the number of matches on one page,
  # the links are chosen at random. Otherwise the first `limit` number are chosen.
  # Default:
  #random: True
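The `match_func` options above expect a dotted import path to a callable that takes a BS4 Tag and returns a boolean. A hypothetical filter module (the module and function names here are illustrative, not part of this commit) could look like:

# my_filters.py -- referenced from the config as, e.g., `match_func: my_filters.short_heading`
from bs4.element import Tag


def short_heading(tag: Tag) -> bool:
    # Accept only tags whose text content is reasonably short
    return len(tag.text.strip()) < 80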
src/soupjobs/run.py (new file, 71 lines)

from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
from distutils.util import strtobool
import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR


if __name__ == '__main__':
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path(__file__).parent, 'config.yaml'),
        help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a url, a list of urls, or a path to a text file with urls"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the url of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Set to either 'yaml', 'json' or 'simple' format"
    )
    kwargs = vars(parser.parse_args())
    # The settings module reads the config path from the environment at import time,
    # so it must be set before anything from soupjobs.settings or soupjobs.scrape is imported.
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
    # Only options that were explicitly passed end up in kwargs (because of argument_default=SUPPRESS),
    # so only those override the loaded settings.
    for key, value in kwargs.items():
        print(key, value, type(value))
        setattr(settings, key, value)
    if not settings.entry_urls:
        print("No urls specified")
        exit()
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n)"
        proceed = strtobool(input(warning))
        if not proceed:
            print("Cancelled")
            exit()
    asyncio.run(Job().start())
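The `argument_default=SUPPRESS` choice is what makes the "CLI overrides config" behaviour work: options the user does not pass never appear in the parsed namespace, so the loop over `kwargs` only touches settings that were given explicitly. A small standalone sketch of that effect:

from argparse import ArgumentParser, SUPPRESS

parser = ArgumentParser(argument_default=SUPPRESS)
parser.add_argument('-d', '--max-depth', type=int)
parser.add_argument('-p', '--max-pages', type=int)

# Only the option that was actually passed shows up, so unset options
# cannot overwrite values loaded from the YAML config or the environment.
print(vars(parser.parse_args(['-d', '3'])))   # {'max_depth': 3}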
src/soupjobs/scrape.py (new file, 209 lines)

from typing import Tuple, List, Dict, Callable, Any
from urllib.parse import urlparse, urljoin
import asyncio
import random
import re
import json

from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
import yaml

from soupjobs.settings import settings, OutputFormat, TagAttrs


BS_PARSER = 'html.parser'


class Job:
    def __init__(self, session: ClientSession = None):
        self.loop = None
        self.session = session
        self.lock = None
        self.results: Dict[str, List[Any]] = {}
        self.num_targets_found: int = 0
        self.num_links_followed: int = 0
        self.page_counter: int = 0
        self.entry_urls: List[str] = []
        urls = [settings.entry_urls] if isinstance(settings.entry_urls, str) else settings.entry_urls
        for url in urls:
            if is_valid_url(url):
                self.entry_urls.append(url)
            else:
                # Treat the string as a path to a text file with one url per line
                with open(url, 'r') as f:
                    self.entry_urls += [line.strip() for line in f if line.strip()]

    async def init_async(self) -> None:
        self.loop = asyncio.get_event_loop()
        self.session = ClientSession(loop=self.loop)
        self.lock = asyncio.Lock()

    async def start(self) -> None:
        await self.init_async()
        print_start()
        try:
            await self.run(*self.entry_urls)
        finally:
            await self.session.close()
            print_end()

    async def run(self, *urls: str, depth: int = 0) -> None:
        async with self.lock:
            # Drop urls that have already been scraped
            urls = list(set(urls).difference(self.results.keys()))
            if settings.max_pages:
                num_requests_left = settings.max_pages - self.num_links_followed
                if num_requests_left <= 0:
                    return
                urls = urls[:num_requests_left]
            self.num_links_followed += len(urls)
        output = await asyncio.gather(*(self.get_and_scrape(url) for url in urls), loop=self.loop)
        assert isinstance(output, list)
        next_links = construct_next_urls(urls, output)
        if depth < settings.max_depth:
            await self.run(*next_links, depth=depth + 1)

    async def get_and_scrape(self, url: str) -> List[str]:
        async with self.lock:
            if settings.target_limit and self.num_targets_found >= settings.target_limit:
                return []
        async with self.session.get(url) as response:
            html = await response.text()
        targets, links = scrape_document(html)
        async with self.lock:
            self.page_counter += 1
            num_targets_left = settings.target_limit - self.num_targets_found if settings.target_limit else None
            targets = targets[:num_targets_left]
            self.results[url] = targets
            self.num_targets_found += len(targets)
            print_page_results(url, targets, self.limit_reached())
        return links

    def limit_reached(self) -> bool:
        if settings.max_pages and self.page_counter >= settings.max_pages:
            return True
        if settings.target_limit and self.num_targets_found >= settings.target_limit:
            return True
        return False


def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
    soup = BeautifulSoup(html, BS_PARSER)
    # Targets:
    targets = []
    for tag in soup.find_all(target_filter, limit=settings.target_limit):
        target = tag.text if settings.target_extract_text else str(tag)
        if settings.target_transform:
            target = settings.target_transform(target)
        targets.append(target)
    # Next links:
    if settings.next_link_random:
        links = soup.find_all(link_filter)
        if settings.next_link_limit and settings.next_link_limit < len(links):
            # Pick a random subset of the matching links
            links = random.sample(links, settings.next_link_limit)
    else:
        links = soup.find_all(link_filter, limit=settings.next_link_limit)
    return targets, [a['href'] for a in links]


def link_filter(tag: Tag) -> bool:
    try:
        if not string_matches(tag['href'], settings.next_link_href, settings.regex_mode):
            return False
    except KeyError:
        return False
    return tag_filter(tag, 'a', text=settings.next_link_text, attrs=settings.next_link_attrs,
                      func=settings.next_link_match_func, regex=settings.regex_mode)


def target_filter(tag: Tag) -> bool:
    return tag_filter(tag, settings.target_tag, text=settings.target_text, attrs=settings.target_attrs,
                      func=settings.target_match_func, regex=settings.regex_mode)


def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = None,
               func: Callable[[Tag], bool] = None, regex: bool = False) -> bool:
    """
    Returns `True` only if the `tag` matches all provided filter criteria.
    Built to be used in the `find_all` method from BeautifulSoup4.

    Args:
        tag:
            The BS4 Tag object to check
        name (optional):
            What kind of tag will be matched (e.g. 'a' would match an HTML anchor tag)
        text (optional):
            The text enclosed by the tag to be matched
        func (optional):
            Function to run on the tag for filtering (should return `True` if the tag matches)
        attrs (optional):
            Any additional attributes the tag should match, e.g. {'class': 'relevant'}
        regex (optional):
            If `True`, all checks are performed by matching the tag's attributes using the provided arguments as
            a regular expression, otherwise they are checked for string equality.
    """
    if not string_matches(tag.name, name, regex):
        return False
    if not string_matches(tag.text, text, regex):
        return False
    for attr_name, attr_value in (attrs or {}).items():
        try:
            if not string_matches(tag[attr_name], attr_value, regex):
                return False
        except KeyError:
            return False
    if func:
        return func(tag)
    return True


def string_matches(search_string: str, expression: str = None, regex: bool = False) -> bool:
    if expression is None:
        return True
    if not regex:
        return search_string == expression
    return re.compile(expression).search(search_string) is not None


def construct_next_urls(urls: List[str], next_links_lists: List[List[str]]) -> List[str]:
    output = set()
    for url, next_links in zip(urls, next_links_lists):
        for link in next_links:
            output.add(urljoin(url, link))
    return list(output)


def is_valid_url(string: str) -> bool:
    parsed = urlparse(string)
    if not all([parsed.scheme, parsed.netloc]):
        return False
    if parsed.scheme not in ('http', 'https'):
        return False
    return True


def print_start() -> None:
    if settings.output_format == OutputFormat.json:
        print('{') if settings.output_with_urls else print('[')


def print_end() -> None:
    if settings.output_format == OutputFormat.json:
        print('}') if settings.output_with_urls else print(']')


def print_page_results(url: str, targets: List[str], last: bool = False) -> None:
    if settings.output_format == OutputFormat.yaml:
        output = yaml.safe_dump({url: targets} if settings.output_with_urls else targets)
        end = ''
    elif settings.output_format == OutputFormat.json:
        output = json.dumps({url: targets} if settings.output_with_urls else targets, indent=2)[2:-2]
        end = '\n' if last else ',\n'
    else:
        output = '\n'.join(targets)
        if settings.output_with_urls:
            output = url + ':\n' + output
        end = '\n'
    print(output, end=end)
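A quick way to see how `tag_filter` behaves is to run it against a toy document. This snippet assumes the package is importable and passes all filter arguments explicitly, so no config file is needed:

from bs4 import BeautifulSoup

from soupjobs.scrape import tag_filter

soup = BeautifulSoup('<h1 id="firstHeading">Python</h1><p>Intro text</p>', 'html.parser')

# Equality mode: the tag name and every given attribute must match exactly.
matches = soup.find_all(lambda tag: tag_filter(tag, name='h1', attrs={'id': 'firstHeading'}))
print([tag.text for tag in matches])  # ['Python']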
src/soupjobs/settings.py (new file, 74 lines)

from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os

from pydantic import BaseSettings, PyObject
import yaml

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER


TagAttrs = Dict[str, str]


class OutputFormat(str, Enum):
    simple = 'simple'
    yaml = 'yaml'
    json = 'json'


class Settings(BaseSettings):
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: Optional[int]
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: Optional[str]
    target_text: Optional[str]
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject]
    target_required: bool = False
    target_limit: Optional[int]
    target_extract_text: bool = True
    target_transform: Optional[PyObject]

    # Link to next page:
    next_link_text: Optional[str]
    next_link_href: Optional[str]
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject]
    next_link_limit: Optional[int]
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    @staticmethod
    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
        try:
            with open(Settings._CONFIG_FILE, 'r') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            return {}
        # Flatten the nested 'target' and 'next_link' sections into prefixed top-level keys
        for section_name in ('target', 'next_link'):
            section = config.pop(section_name, {})
            for key, value in section.items():
                config.setdefault(f'{section_name}_{key}', value)
        return config

    class Config:
        validate_assignment = True
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings


settings = Settings()
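The `yaml_settings` source is the glue between example.config.yaml and the flat field names on the model: the nested `target` and `next_link` sections are flattened into prefixed keys. Roughly, using values from the example config above:

config = {
    'regex_mode': True,
    'target': {'tag': 'h1', 'attrs': {'id': 'firstHeading'}, 'limit': 20},
    'next_link': {'href': r'^/wiki/\w+', 'limit': 10},
}

# Same flattening loop as in Settings.yaml_settings()
for section_name in ('target', 'next_link'):
    section = config.pop(section_name, {})
    for key, value in section.items():
        config.setdefault(f'{section_name}_{key}', value)

print(config)
# {'regex_mode': True, 'target_tag': 'h1', 'target_attrs': {'id': 'firstHeading'},
#  'target_limit': 20, 'next_link_href': '^/wiki/\\w+', 'next_link_limit': 10}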