minor optimizations
This commit is contained in:
parent
5ea0a55bcb
commit
529c1a452c
@ -34,4 +34,4 @@ tests =
|
|||||||
where = src
|
where = src
|
||||||
|
|
||||||
[options.package_data]
|
[options.package_data]
|
||||||
soupjobs = example.config.yaml
|
soupjobs = default.config.yaml
|
||||||
|
@ -1,2 +1,4 @@
|
|||||||
|
SOUPJOBS = 'soupjobs'
|
||||||
CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
|
CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
|
||||||
CONFIG_FILE_PLACEHOLDER = 'placeholder'
|
CONFIG_FILE_PLACEHOLDER = 'placeholder'
|
||||||
|
CONFIG_FILE_DEFAULT_NAME = 'config.yaml'
|
||||||
|
@ -11,21 +11,15 @@
|
|||||||
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
|
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
|
||||||
# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
|
# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
|
||||||
entry_urls:
|
entry_urls:
|
||||||
- https://en.wikipedia.org/wiki/Python_(programming_language)
|
|
||||||
- https://en.wikipedia.org/wiki/Guido_van_Rossum
|
|
||||||
|
|
||||||
# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
|
# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
|
||||||
# regular expressions to match the attribute values, otherwise they are simply checked for string equality.
|
# regular expressions to match the attribute values, otherwise they are simply checked for string equality.
|
||||||
# Default:
|
regex_mode: False
|
||||||
#regex_mode: False
|
|
||||||
regex_mode: True
|
|
||||||
|
|
||||||
# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
|
# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
|
||||||
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
|
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
|
||||||
# and then for every one of those, the matching links may be followed again, but then the job would end.
|
# and then for every one of those, the matching links may be followed again, but then the job would end.
|
||||||
# Default:
|
max_depth: 0
|
||||||
#max_depth: 0
|
|
||||||
max_depth: 2
|
|
||||||
|
|
||||||
# Maximum number of pages to visit.
|
# Maximum number of pages to visit.
|
||||||
# Example:
|
# Example:
|
||||||
@ -33,14 +27,12 @@ max_depth: 2
|
|||||||
|
|
||||||
# If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
|
# If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
|
||||||
# otherwise the output is merely a list of all extracted matches.
|
# otherwise the output is merely a list of all extracted matches.
|
||||||
# Default:
|
output_with_urls: False
|
||||||
#output_with_urls: False
|
|
||||||
|
|
||||||
# Output can be produced in either 'yaml', 'json' or 'simple' format.
|
# Output can be produced in either 'yaml', 'json' or 'simple' format.
|
||||||
# The last of these will simply print the extracted targets line by line if `output_with_urls` is `False` and
|
# The last of these will simply print the extracted targets line by line if `output_with_urls` is `False` and
|
||||||
# add the url of the page they were extracted from in an additional line before them if that setting is `True`.
|
# add the url of the page they were extracted from in an additional line before them if that setting is `True`.
|
||||||
# Default:
|
output_format: simple
|
||||||
#output_format: simple
|
|
||||||
|
|
||||||
|
|
||||||
# Target section
|
# Target section
|
||||||
@ -51,16 +43,16 @@ max_depth: 2
|
|||||||
target:
|
target:
|
||||||
# Filter by HTML tag
|
# Filter by HTML tag
|
||||||
# Example to only look for <h1> tags:
|
# Example to only look for <h1> tags:
|
||||||
tag: h1
|
#tag: h1
|
||||||
|
|
||||||
# Filter by text inside the tag
|
# Filter by text inside the tag
|
||||||
# Example:
|
# Example:
|
||||||
#text: program
|
#text: program
|
||||||
|
|
||||||
# Filter by any valid HTML attributes
|
# Filter by any valid HTML attributes
|
||||||
attrs:
|
# Example:
|
||||||
# Examples:
|
#attrs:
|
||||||
id: firstHeading
|
#id: firstHeading
|
||||||
#class: foo
|
#class: foo
|
||||||
#role: bar
|
#role: bar
|
||||||
|
|
||||||
@ -70,18 +62,16 @@ target:
|
|||||||
#match_func: module.function
|
#match_func: module.function
|
||||||
|
|
||||||
# If this is set to `True` and no matching tags are found on a page, an exception is raised.
|
# If this is set to `True` and no matching tags are found on a page, an exception is raised.
|
||||||
# Default:
|
required: False
|
||||||
#required: False
|
|
||||||
|
|
||||||
# Stop making requests as soon as possible once this number of matches has been extracted.
|
# Stop making requests as soon as possible once this number of matches has been extracted.
|
||||||
# Note that setting this parameter will restrict the number of returned targets to no more than is set here,
|
# Note that setting this parameter will restrict the number of returned targets to no more than is set here,
|
||||||
# but in asynchronous execution the total number of requests made and targets scraped may be higher.
|
# but in asynchronous execution the total number of requests made and targets scraped may be higher.
|
||||||
# Example:
|
# Example:
|
||||||
limit: 20
|
#limit: 20
|
||||||
|
|
||||||
# If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
|
# If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
|
||||||
# Default:
|
extract_text: True
|
||||||
#extract_text: True
|
|
||||||
|
|
||||||
# Optional transformation function to apply to every matching target tag.
|
# Optional transformation function to apply to every matching target tag.
|
||||||
# Should take a BS4 Tag object as its sole argument and return a string.
|
# Should take a BS4 Tag object as its sole argument and return a string.
|
||||||
@ -102,7 +92,7 @@ next_link:
|
|||||||
|
|
||||||
# Filter by the `href` attribute of the anchor tag.
|
# Filter by the `href` attribute of the anchor tag.
|
||||||
# Example:
|
# Example:
|
||||||
href: '^/wiki/\w+'
|
#href: '^/wiki/\w+'
|
||||||
|
|
||||||
# Filter by any other valid HTML attributes
|
# Filter by any other valid HTML attributes
|
||||||
# Example:
|
# Example:
|
||||||
@ -116,9 +106,8 @@ next_link:
|
|||||||
|
|
||||||
# Get at most this many links to other pages from one page.
|
# Get at most this many links to other pages from one page.
|
||||||
# Example:
|
# Example:
|
||||||
limit: 10
|
#limit: 10
|
||||||
|
|
||||||
# If `True`, and a limit is set that is below the number of matches on one page,
|
# If `True`, and a limit is set that is below the number of matches on one page,
|
||||||
# the links are chosen at random. Otherwise the first `limit` number are chosen.
|
# the links are chosen at random. Otherwise the first `limit` number are chosen.
|
||||||
# Default:
|
random: True
|
||||||
#random: True
|
|
@ -4,7 +4,7 @@ from distutils.util import strtobool
|
|||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from soupjobs import CONFIG_FILE_ENV_VAR
|
from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
@ -17,8 +17,8 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-c', '--config-file',
|
'-c', '--config-file',
|
||||||
type=Path,
|
type=Path,
|
||||||
default=Path(Path(__file__).parent, 'config.yaml'),
|
default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
|
||||||
help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
|
help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'entry_urls',
|
'entry_urls',
|
||||||
@ -64,7 +64,7 @@ if __name__ == '__main__':
|
|||||||
if not settings.has_target_filters():
|
if not settings.has_target_filters():
|
||||||
warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
|
warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
|
||||||
"Are you sure you want to proceed? (y/n)"
|
"Are you sure you want to proceed? (y/n)"
|
||||||
proceed = strtobool(input(warning))
|
proceed = strtobool(input(warning).lower())
|
||||||
if not proceed:
|
if not proceed:
|
||||||
print("Cancelled")
|
print("Cancelled")
|
||||||
exit()
|
exit()
|
||||||
|
@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
|
|||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from soupjobs.settings import settings, OutputFormat, TagAttrs
|
from .settings import settings, OutputFormat, TagAttrs
|
||||||
|
|
||||||
|
|
||||||
BS_PARSER = 'html.parser'
|
BS_PARSER = 'html.parser'
|
||||||
|
@ -6,7 +6,7 @@ import os
|
|||||||
from pydantic import BaseSettings, PyObject
|
from pydantic import BaseSettings, PyObject
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
|
from soupjobs import SOUPJOBS, CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
|
||||||
|
|
||||||
|
|
||||||
TagAttrs = Dict[str, str]
|
TagAttrs = Dict[str, str]
|
||||||
@ -19,7 +19,7 @@ class OutputFormat(str, Enum):
|
|||||||
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
_CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
|
CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
|
||||||
|
|
||||||
entry_urls: Union[str, List[str]] = []
|
entry_urls: Union[str, List[str]] = []
|
||||||
regex_mode: bool = False
|
regex_mode: bool = False
|
||||||
@ -49,10 +49,19 @@ class Settings(BaseSettings):
|
|||||||
def has_target_filters(self) -> bool:
|
def has_target_filters(self) -> bool:
|
||||||
return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])
|
return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])
|
||||||
|
|
||||||
@staticmethod
|
class Config:
|
||||||
def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
|
validate_assignment = True
|
||||||
|
env_prefix = SOUPJOBS + '_'
|
||||||
|
env_file_encoding = 'utf-8'
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def customise_sources(cls, init_settings, env_settings, file_secret_settings):
|
||||||
|
return init_settings, env_settings, file_secret_settings, yaml_settings
|
||||||
|
|
||||||
|
|
||||||
|
def yaml_settings(settings_obj: Settings) -> Dict[str, Any]:
|
||||||
try:
|
try:
|
||||||
with open(Settings._CONFIG_FILE, 'r') as f:
|
with open(settings_obj.CONFIG_FILE, 'r') as f:
|
||||||
config = yaml.safe_load(f)
|
config = yaml.safe_load(f)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
return {}
|
return {}
|
||||||
@ -62,13 +71,5 @@ class Settings(BaseSettings):
|
|||||||
config.setdefault(f'{section_name}_{key}', value)
|
config.setdefault(f'{section_name}_{key}', value)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
class Config:
|
|
||||||
validate_assignment = True
|
|
||||||
env_file_encoding = 'utf-8'
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def customise_sources(cls, init_settings, env_settings, file_secret_settings):
|
|
||||||
return init_settings, env_settings, file_secret_settings, Settings.yaml_settings
|
|
||||||
|
|
||||||
|
|
||||||
settings = Settings()
|
settings = Settings()
|
||||||
|
Loading…
Reference in New Issue
Block a user