minor optimizations

Daniil Fajnberg 2021-07-25 14:45:07 +02:00
parent 5ea0a55bcb
commit 529c1a452c
6 changed files with 39 additions and 47 deletions

View File

@@ -34,4 +34,4 @@ tests =
where = src
[options.package_data]
soupjobs = example.config.yaml
soupjobs = default.config.yaml
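
For context, a minimal sketch (not part of this commit) of how the renamed default.config.yaml, declared as package data above, could be read from an installed soupjobs package; the importlib.resources access pattern is an assumption, not code from this repository:

from importlib import resources

# read_text() returns the contents of a file shipped as package data;
# 'soupjobs' and 'default.config.yaml' are the names used in setup.cfg above.
text = resources.read_text('soupjobs', 'default.config.yaml')
print(text.splitlines()[0])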

View File

@@ -1,2 +1,4 @@
SOUPJOBS = 'soupjobs'
CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
CONFIG_FILE_PLACEHOLDER = 'placeholder'
CONFIG_FILE_DEFAULT_NAME = 'config.yaml'
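
The two added constants back the changes further down: CONFIG_FILE_DEFAULT_NAME feeds the run script's new default path, and SOUPJOBS becomes the settings' environment prefix. A hypothetical helper (not in the commit) showing how they can combine:

import os
from pathlib import Path

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME

def resolve_config_path() -> Path:
    # The SOUPJOBS_CONFIG environment variable wins; otherwise fall back
    # to 'config.yaml' in the current working directory.
    override = os.getenv(CONFIG_FILE_ENV_VAR)
    return Path(override) if override else Path.cwd() / CONFIG_FILE_DEFAULT_NAME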

View File

@@ -11,21 +11,15 @@
# if specified as a string in valid URL format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be the path to a text file containing one URL per line to visit and scrape.
entry_urls:
  - https://en.wikipedia.org/wiki/Python_(programming_language)
  - https://en.wikipedia.org/wiki/Guido_van_Rossum
# If `True`, all filter checks treat the provided arguments as regular expressions to be matched
# against an HTML tag's attribute values; otherwise the attribute values are simply checked for string equality.
# Default:
#regex_mode: False
regex_mode: True
regex_mode: False
# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
# Default:
#max_depth: 0
max_depth: 2
max_depth: 0
# Maximum number of pages to visit.
# Example:
@@ -33,14 +27,12 @@ max_depth: 2
# If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
# Default:
#output_with_urls: False
output_with_urls: False
# Output can be produced in 'yaml', 'json', or 'simple' format.
# The 'simple' format prints the extracted targets line by line if `output_with_urls` is `False`, and
# adds the url of the page they were extracted from on an additional line before them if that setting is `True`.
# Default:
#output_format: simple
output_format: simple
# Target section
@@ -51,16 +43,16 @@ max_depth: 2
target:
# Filter by HTML tag
# Example to only look for <h1> tags:
tag: h1
#tag: h1
# Filter by text inside the tag
# Example:
#text: program
# Filter by any valid HTML attributes
attrs:
# Examples:
  id: firstHeading
# Example:
#attrs:
#id: firstHeading
#class: foo
#role: bar
@@ -70,18 +62,16 @@ target:
#match_func: module.function
# If this is set to `True` and no matching tags are found on a page, an exception is raised.
# Default:
#required: False
required: False
# Stop making requests as soon as possible once this number of matches has been extracted.
# Note that setting this parameter restricts the number of returned targets to no more than the value set here,
# but in asynchronous execution the total number of requests made and targets scraped may be higher.
# Example:
limit: 20
#limit: 20
# If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
# Default:
#extract_text: True
extract_text: True
# Optional transformation function to apply to every matching target tag.
# Should take a BS4 Tag object as its sole argument and return a string.
@@ -102,7 +92,7 @@ next_link:
# Filter by the `href` attribute of the anchor tag.
# Example:
href: '^/wiki/\w+'
#href: '^/wiki/\w+'
# Filter by any other valid HTML attributes
# Example:
@@ -116,9 +106,8 @@ next_link:
# Get at most this many links to other pages from one page.
# Example:
limit: 10
#limit: 10
# If `True` and a limit is set that is below the number of matches on one page,
# the links are chosen at random; otherwise the first `limit` links are chosen.
# Default:
#random: True
random: True
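
Taken together, the config changes comment out the example values so that only the documented defaults remain active. A quick sanity check (an assumed workflow, not part of the commit) that parses the shipped file, using the name from setup.cfg above:

import yaml

with open('default.config.yaml') as f:
    config = yaml.safe_load(f)

assert config['regex_mode'] is False  # example value True was dropped
assert config['max_depth'] == 0       # example value 2 was dropped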

View File

@@ -4,7 +4,7 @@ from distutils.util import strtobool
import asyncio
import os
from soupjobs import CONFIG_FILE_ENV_VAR
from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME
if __name__ == '__main__':
@@ -17,8 +17,8 @@ if __name__ == '__main__':
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path(__file__).parent, 'config.yaml'),
        help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
        default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
        help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
    )
    parser.add_argument(
        'entry_urls',
@@ -64,7 +64,7 @@ if __name__ == '__main__':
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n)"
        proceed = strtobool(input(warning))
        proceed = strtobool(input(warning).lower())
        if not proceed:
            print("Cancelled")
            exit()
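
Note that distutils' strtobool() maps 'y', 'yes', 't', 'true', 'on' and '1' to 1, their negative counterparts to 0, and raises ValueError for any other answer, which would crash the prompt above. A more forgiving wrapper might look like this (a sketch, not part of the commit):

from distutils.util import strtobool

def ask_yes_no(prompt: str) -> bool:
    # Re-prompt until the user gives one of strtobool's accepted answers.
    while True:
        try:
            return bool(strtobool(input(prompt).strip().lower()))
        except ValueError:
            print("Please answer 'y' or 'n'.")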

View File

@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
from bs4.element import Tag
import yaml
from soupjobs.settings import settings, OutputFormat, TagAttrs
from .settings import settings, OutputFormat, TagAttrs
BS_PARSER = 'html.parser'
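
BS_PARSER pins BeautifulSoup to the stdlib 'html.parser', so no third-party parser such as lxml is required. An illustration of the kind of tag lookup the target filters in the config describe (assumed usage; the scraping code itself is not shown in this diff):

from bs4 import BeautifulSoup

BS_PARSER = 'html.parser'
html = '<h1 id="firstHeading">Python (programming language)</h1>'
soup = BeautifulSoup(html, BS_PARSER)
tag = soup.find('h1', attrs={'id': 'firstHeading'})
print(tag.get_text())  # Python (programming language)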

View File

@@ -6,7 +6,7 @@ import os
from pydantic import BaseSettings, PyObject
import yaml
from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
TagAttrs = Dict[str, str]
@@ -19,7 +19,7 @@ class OutputFormat(str, Enum):
class Settings(BaseSettings):
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
    CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
@@ -49,26 +49,27 @@ class Settings(BaseSettings):
    def has_target_filters(self) -> bool:
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    @staticmethod
    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
        try:
            with open(Settings._CONFIG_FILE, 'r') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            return {}
        for section_name in ('target', 'next_link'):
            section = config.pop(section_name, {})
            for key, value in section.items():
                config.setdefault(f'{section_name}_{key}', value)
        return config

    class Config:
        validate_assignment = True
        env_prefix = SOUPJOBS + '_'
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings
            return init_settings, env_settings, file_secret_settings, yaml_settings

def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
    try:
        with open(settings_obj.CONFIG_FILE, 'r') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    for section_name in ('target', 'next_link'):
        section = config.pop(section_name, {})
        for key, value in section.items():
            config.setdefault(f'{section_name}_{key}', value)
    return config
settings = Settings()
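
For reference, a minimal self-contained sketch of the pattern used here (pydantic v1 API; the field name and file name are placeholders): customise_sources() appends a plain function as an extra settings source, so YAML values rank below init kwargs, environment variables and secret files:

from typing import Any, Dict

import yaml
from pydantic import BaseSettings

def yaml_source(_settings: BaseSettings) -> Dict[str, Any]:
    # Custom sources in pydantic v1 are callables taking the settings
    # instance and returning a dict of field values.
    try:
        with open('config.yaml') as f:
            return yaml.safe_load(f) or {}
    except FileNotFoundError:
        return {}

class Example(BaseSettings):
    max_depth: int = 0

    class Config:
        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            # Sources earlier in the tuple take priority over later ones.
            return init_settings, env_settings, file_secret_settings, yaml_source

print(Example().max_depth)  # 0 unless config.yaml or the environment overrides it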