minor optimizations

2021-07-25 14:45:07 +02:00
parent 5ea0a55bcb
commit 529c1a452c
6 changed files with 39 additions and 47 deletions
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,4 +34,4 @@ tests =
 where = src
 [options.package_data]
-soupjobs = example.config.yaml
+soupjobs = default.config.yaml
--- a/src/soupjobs/init.py
+++ b/src/soupjobs/init.py
@@ -1,2 +1,4 @@
 SOUPJOBS = 'soupjobs'
 CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
 CONFIG_FILE_PLACEHOLDER = 'placeholder'
 CONFIG_FILE_DEFAULT_NAME = 'config.yaml'
--- a/src/soupjobs/default.config.yaml
+++ b/src/soupjobs/default.config.yaml
@@ -11,21 +11,15 @@
 # if specified as a string that has a valid URL format, the corresponding page is visited and scraped;
 # otherwise the string is assumed to be a path to a text file with one URL per line, each to be visited and scraped.
 entry_urls:
  - https://en.wikipedia.org/wiki/Python_(programming_language)
  - https://en.wikipedia.org/wiki/Guido_van_Rossum
 # If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
 # regular expressions to match the attribute values; otherwise they are simply checked for string equality.
-# Default:
+regex_mode: False
 #regex_mode: False
 regex_mode: True
 # The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
 # For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
 # and then for every one of those, the matching links may be followed again, but then the job would end.
-# Default:
+max_depth: 0
 #max_depth: 0
 max_depth: 2
 # Maximum number of pages to visit.
 # Example:
@@ -33,14 +27,12 @@ max_depth: 2
 # If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
 # otherwise the output is merely a list of all extracted matches.
-# Default:
+output_with_urls: False
 #output_with_urls: False
 # Output can be produced in either 'yaml', 'json' or 'simple' format.
 # The latter will simply print the extracted targets line by line if `output_with_urls` is `False` and
 # add the url of the page they were extracted from in an additional line before them if that setting is `True`.
-# Default:
+output_format: simple
 #output_format: simple
 # Target section
@@ -51,16 +43,16 @@ max_depth: 2
 target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
-  tag: h1
+  #tag: h1
  # Filter by text inside the tag
  # Example:
  #text: program
  # Filter by any valid HTML attributes
-  attrs:
+  # Example:
-    # Examples:
+  #attrs:
-    id: firstHeading
+    #id: firstHeading
    #class: foo
    #role: bar
@@ -70,18 +62,16 @@ target:
  #match_func: module.function
  # If this is set to `True` and no matching tags are found on a page, an exception is raised.
-  # Default:
+  required: False
  #required: False
  # Stop making requests as soon as possible once this number of matches has been extracted.
  # Note that setting this parameter will restrict the number of returned targets to no more than is set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
-  limit: 20
+  #limit: 20
  # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
-  # Default:
+  extract_text: True
  #extract_text: True
  # Optional transformation function to apply to every matching target tag.
  # Should take a BS4 Tag object as its sole argument and return a string.
@@ -102,7 +92,7 @@ next_link:
  # Filter by the `href` attribute of the anchor tag.
  # Example:
-  href: '^/wiki/\w+'
+  #href: '^/wiki/\w+'
  # Filter by any other valid HTML attributes
  # Example:
@@ -116,9 +106,8 @@ next_link:
  # Get at most this many links to other pages from one page.
  # Example:
-  limit: 10
+  #limit: 10
  # If `True` and a limit is set that is below the number of matches on one page,
  # the links are chosen at random. Otherwise, the first `limit` links are chosen.
-  # Default:
+  random: True
  #random: True
--- a/src/soupjobs/run.py
+++ b/src/soupjobs/run.py
@@ -4,7 +4,7 @@ from distutils.util import strtobool
 import asyncio
 import os
-from soupjobs import CONFIG_FILE_ENV_VAR
+from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME
 if __name__ == '__main__':
@@ -17,8 +17,8 @@ if __name__ == '__main__':
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
-        default=Path(Path(__file__).parent, 'config.yaml'),
+        default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
-        help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
+        help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
    )
    parser.add_argument(
        'entry_urls',
@@ -64,7 +64,7 @@ if __name__ == '__main__':
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n)"
-        proceed = strtobool(input(warning))
+        proceed = strtobool(input(warning).lower())
        if not proceed:
            print("Cancelled")
            exit()
--- a/src/soupjobs/scrape.py
+++ b/src/soupjobs/scrape.py
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 import yaml
-from soupjobs.settings import settings, OutputFormat, TagAttrs
+from .settings import settings, OutputFormat, TagAttrs
 BS_PARSER = 'html.parser'
--- a/src/soupjobs/settings.py
+++ b/src/soupjobs/settings.py
@@ -6,7 +6,7 @@ import os
 from pydantic import BaseSettings, PyObject
 import yaml
-from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
+from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
 TagAttrs = Dict[str, str]
@@ -19,7 +19,7 @@ class OutputFormat(str, Enum):
 class Settings(BaseSettings):
-    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
+    CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
@@ -49,10 +49,19 @@ class Settings(BaseSettings):
    def has_target_filters(self) -> bool:
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])
-    @staticmethod
+    class Config:
-    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
+        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'
        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            return init_settings, env_settings, file_secret_settings, yaml_settings
 def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
    try:
-            with open(Settings._CONFIG_FILE, 'r') as f:
+        with open(settings_obj.CONFIG_FILE, 'r') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
@@ -62,13 +71,5 @@ class Settings(BaseSettings):
            config.setdefault(f'{section_name}_{key}', value)
    return config
    class Config:
        validate_assignment = True
        env_file_encoding = 'utf-8'
        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings
 settings = Settings()