diff --git a/setup.cfg b/setup.cfg
index f6dcff5..89c26d9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,4 +34,4 @@ tests =
where = src
[options.package_data]
-soupjobs = example.config.yaml
+soupjobs = default.config.yaml
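The renamed `default.config.yaml` stays registered under `[options.package_data]`, so it ships inside the installed `soupjobs` package. A minimal sketch of how such a packaged file could be read at runtime, assuming `importlib.resources.files` (Python 3.9+) and PyYAML; this helper is illustrative and not part of the diff:

```python
from importlib.resources import files

import yaml

def load_packaged_default() -> dict:
    # Read the default config bundled via [options.package_data] in setup.cfg.
    text = files('soupjobs').joinpath('default.config.yaml').read_text(encoding='utf-8')
    return yaml.safe_load(text) or {}
```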
diff --git a/src/soupjobs/__init__.py b/src/soupjobs/__init__.py
index 9c9ee07..e80b3c8 100644
--- a/src/soupjobs/__init__.py
+++ b/src/soupjobs/__init__.py
@@ -1,2 +1,4 @@
+SOUPJOBS = 'soupjobs'
CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
CONFIG_FILE_PLACEHOLDER = 'placeholder'
+CONFIG_FILE_DEFAULT_NAME = 'config.yaml'
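The new constants centralize the names used for config discovery: the `SOUPJOBS_CONFIG` environment variable and the default file name `config.yaml`. A sketch of how a consumer might combine them, mirroring the lookup order that `run.py` and `settings.py` use below (the `resolve_config_path` helper is hypothetical):

```python
import os
from pathlib import Path

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME

def resolve_config_path() -> Path:
    # Prefer an explicit path from SOUPJOBS_CONFIG; otherwise fall back to
    # 'config.yaml' in the current working directory.
    env_path = os.getenv(CONFIG_FILE_ENV_VAR)
    return Path(env_path) if env_path else Path.cwd() / CONFIG_FILE_DEFAULT_NAME
```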
diff --git a/src/soupjobs/example.config.yaml b/src/soupjobs/default.config.yaml
similarity index 89%
rename from src/soupjobs/example.config.yaml
rename to src/soupjobs/default.config.yaml
index deb3774..1ebc033 100644
--- a/src/soupjobs/example.config.yaml
+++ b/src/soupjobs/default.config.yaml
@@ -11,21 +11,15 @@
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be the path to a text file containing one url per line, each of which is visited and scraped.
entry_urls:
- - https://en.wikipedia.org/wiki/Python_(programming_language)
- - https://en.wikipedia.org/wiki/Guido_van_Rossum
# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
# regular expressions against the attribute values; otherwise the values are simply checked for string equality.
-# Default:
-#regex_mode: False
-regex_mode: True
+regex_mode: False
# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that matching links may be followed from every entry page,
# and from each page reached that way they may be followed once more; after that the job ends.
-# Default:
-#max_depth: 0
-max_depth: 2
+max_depth: 0
# Maximum number of pages to visit.
# Example:
@@ -33,14 +27,12 @@ max_depth: 2
# If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
-# Default:
-#output_with_urls: False
+output_with_urls: False
# Output can be produced in 'yaml', 'json', or 'simple' format.
# The 'simple' format prints the extracted targets line by line if `output_with_urls` is `False`, and
# additionally prints the url of the page they were extracted from on a line before them if that setting is `True`.
-# Default:
-#output_format: simple
+output_format: simple
# Target section
@@ -51,16 +43,16 @@ max_depth: 2
target:
# Filter by HTML tag
# Example to only look for <h1> tags:
- tag: h1
+ #tag: h1
# Filter by text inside the tag
# Example:
#text: program
# Filter by any valid HTML attributes
- attrs:
- # Examples:
- id: firstHeading
+ # Example:
+ #attrs:
+ #id: firstHeading
#class: foo
#role: bar
@@ -70,18 +62,16 @@ target:
#match_func: module.function
# If this is set to `True` and no matching tags are found on a page, an exception is raised.
- # Default:
- #required: False
+ required: False
# Stop making requests as soon as possible once this number of matches has been extracted.
# Note that setting this parameter will restrict the number of returned targets to no more than is set here,
# but in asynchronous execution the total number of requests made and targets scraped may be higher.
# Example:
- limit: 20
+ #limit: 20
# If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
- # Default:
- #extract_text: True
+ extract_text: True
# Optional transformation function to apply to every matching target tag.
# Should take a BS4 Tag object as its sole argument and return a string.
@@ -102,7 +92,7 @@ next_link:
# Filter by the `href` attribute of the anchor tag.
# Example:
- href: '^/wiki/\w+'
+ #href: '^/wiki/\w+'
# Filter by any other valid HTML attributes
# Example:
@@ -116,9 +106,8 @@ next_link:
# Get at most this many links to other pages from one page.
# Example:
- limit: 10
+ #limit: 10
# If `True`, and a limit is set that is below the number of matches on one page,
# the links are chosen at random; otherwise the first `limit` links are chosen.
- # Default:
- #random: True
+ random: True
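Putting the commented-out defaults and examples above together, a minimal working config might look like this; the concrete values are taken from the examples in the old file and are purely illustrative:

```yaml
entry_urls:
  - https://en.wikipedia.org/wiki/Python_(programming_language)
max_depth: 1
output_format: json
target:
  tag: h1
  extract_text: True
next_link:
  href: '^/wiki/\w+'
  limit: 10
  random: True
```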
diff --git a/src/soupjobs/run.py b/src/soupjobs/run.py
index ebc9100..33d27ed 100644
--- a/src/soupjobs/run.py
+++ b/src/soupjobs/run.py
@@ -4,7 +4,7 @@ from distutils.util import strtobool
import asyncio
import os
-from soupjobs import CONFIG_FILE_ENV_VAR
+from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME
if __name__ == '__main__':
@@ -17,8 +17,8 @@ if __name__ == '__main__':
parser.add_argument(
'-c', '--config-file',
type=Path,
- default=Path(Path(__file__).parent, 'config.yaml'),
- help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
+ default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
+ help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
)
parser.add_argument(
'entry_urls',
@@ -64,7 +64,7 @@ if __name__ == '__main__':
if not settings.has_target_filters():
warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
"Are you sure you want to proceed? (y/n)"
- proceed = strtobool(input(warning))
+ proceed = strtobool(input(warning).lower())
if not proceed:
print("Cancelled")
exit()
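With the new default, the runner now picks up `config.yaml` from the working directory rather than from the package directory. Assuming the script is invoked as a module (an assumption; the diff shows no console entry point), usage might look like:

```
# Use ./config.yaml implicitly, passing entry urls on the command line:
python -m soupjobs.run https://en.wikipedia.org/wiki/Python_(programming_language)

# Or point at an explicit config file:
python -m soupjobs.run -c my_config.yaml
```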
diff --git a/src/soupjobs/scrape.py b/src/soupjobs/scrape.py
index 4a1d705..b6c1177 100644
--- a/src/soupjobs/scrape.py
+++ b/src/soupjobs/scrape.py
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
from bs4.element import Tag
import yaml
-from soupjobs.settings import settings, OutputFormat, TagAttrs
+from .settings import settings, OutputFormat, TagAttrs
BS_PARSER = 'html.parser'
diff --git a/src/soupjobs/settings.py b/src/soupjobs/settings.py
index 8699fe6..296bc5d 100644
--- a/src/soupjobs/settings.py
+++ b/src/soupjobs/settings.py
@@ -6,7 +6,7 @@ import os
from pydantic import BaseSettings, PyObject
import yaml
-from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
+from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
TagAttrs = Dict[str, str]
@@ -19,7 +19,7 @@ class OutputFormat(str, Enum):
class Settings(BaseSettings):
- _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
+ CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
entry_urls: Union[str, List[str]] = []
regex_mode: bool = False
@@ -49,26 +49,27 @@ class Settings(BaseSettings):
def has_target_filters(self) -> bool:
return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])
- @staticmethod
- def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
- try:
- with open(Settings._CONFIG_FILE, 'r') as f:
- config = yaml.safe_load(f)
- except FileNotFoundError:
- return {}
- for section_name in ('target', 'next_link'):
- section = config.pop(section_name, {})
- for key, value in section.items():
- config.setdefault(f'{section_name}_{key}', value)
- return config
-
class Config:
validate_assignment = True
+ env_prefix = SOUPJOBS + '_'
env_file_encoding = 'utf-8'
@classmethod
def customise_sources(cls, init_settings, env_settings, file_secret_settings):
- return init_settings, env_settings, file_secret_settings, Settings.yaml_settings
+ return init_settings, env_settings, file_secret_settings, yaml_settings
+
+
+def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
+ try:
+ with open(settings_obj.CONFIG_FILE, 'r') as f:
+ config = yaml.safe_load(f)
+ except FileNotFoundError:
+ return {}
+ for section_name in ('target', 'next_link'):
+ section = config.pop(section_name, {})
+ for key, value in section.items():
+ config.setdefault(f'{section_name}_{key}', value)
+ return config
settings = Settings()
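Two behaviors of the reworked settings module are worth illustrating: `yaml_settings` flattens the `target` and `next_link` sections into the `target_*`/`next_link_*` field names, and the new `env_prefix` lets any field be overridden through a `SOUPJOBS_*` environment variable, with the environment taking precedence over the yaml source (it comes earlier in `customise_sources`). A hedged sketch; the file path and values are hypothetical:

```python
import os

# Set these before importing soupjobs.settings: the CONFIG_FILE default reads
# SOUPJOBS_CONFIG at import time.
os.environ['SOUPJOBS_CONFIG'] = 'my_config.yaml'  # e.g. containing target: {tag: h1}
os.environ['SOUPJOBS_MAX_DEPTH'] = '3'            # env_prefix maps this to max_depth

from soupjobs.settings import Settings

settings = Settings()
# yaml_settings turns {'target': {'tag': 'h1'}} into {'target_tag': 'h1'},
# while the environment override wins for max_depth.
print(settings.target_tag, settings.max_depth)
```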