# soupjobs/src/soupjobs/default.config.yaml

############################
# Scrape job configuration #
############################
# Every config parameter is explained here, each shown with either its default value or an example value.
# General section
#################
# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string in valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be the path to a text file containing one url per line to visit and scrape.
entry_urls:
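# Hypothetical examples of the three accepted forms (urls and file name are placeholders):
#entry_urls:
#  - https://en.wikipedia.org/wiki/Web_scraping
#  - https://en.wikipedia.org/wiki/HTML
# or a single url:
#entry_urls: https://en.wikipedia.org/wiki/Web_scraping
# or a path to a text file with one url per line:
#entry_urls: urls.txt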
# If `True`, all filter checks treat the provided arguments as regular expressions to match
# against an HTML tag's attribute values; otherwise the values are simply checked for string equality.
regex_mode: False
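# Illustration (hypothetical filter): with `regex_mode: True`, an attribute filter such as
# `class: 'foo.*'` is interpreted as a regular expression pattern; with `regex_mode: False`,
# only the literal attribute value 'foo.*' would match.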
# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
max_depth: 0
# Maximum number of pages to visit.
# Example:
#max_pages: 100
# If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
output_with_urls: False
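# Hypothetical mapping output (rendered in 'json' format; see `output_format` below):
#
#   {"https://en.wikipedia.org/wiki/Web_scraping": ["First match", "Second match"]}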
# Output can be produced in 'yaml', 'json' or 'simple' format.
# The 'simple' format prints the extracted targets line by line if `output_with_urls` is `False` and
# adds the url of the page they were extracted from on an additional line before them if that setting is `True`.
output_format: simple
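# Hypothetical 'simple' output for two matches from one page, first with
# `output_with_urls: False`, then with `output_with_urls: True`:
#
#   First match
#   Second match
#
#   https://en.wikipedia.org/wiki/Web_scraping
#   First match
#   Second match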
# Target section
################
# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
  #tag: h1
  # Filter by text inside the tag
  # Example:
  #text: program
  # Filter by any valid HTML attributes
  # Example:
  #attrs:
  #  id: firstHeading
  #  class:
  #    - foo
  #    - example
  #  role: bar
  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
  # Example:
  #match_func: module.function
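  # A minimal sketch of such a function (hypothetical module `myfilters`; only the
  # documented signature is assumed: one BS4 Tag in, a boolean out):
  #
  #   # myfilters.py
  #   def has_id(tag):
  #       # Match only tags that carry an `id` attribute.
  #       return tag.get('id') is not None
  #
  # which would be referenced here as:
  #match_func: myfilters.has_id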
  # If this is set to `True` and no matching tags are found on a page, an exception is raised.
  required: False
  # Stop making requests as soon as possible once this number of matches has been extracted.
  # Note that setting this parameter restricts the number of returned targets to no more than the value set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
  #limit: 20
  # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
  extract_text: True
  # Optional transformation function to apply to every matching target tag.
  # If `extract_text` is `True`, it should take a string as its sole argument and return a string;
  # otherwise it should take a BS4 Tag object as its sole argument and return a string.
  # Example:
  #transform: module.function
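  # A minimal sketch of a transform (hypothetical module `mytransforms`; assumes
  # `extract_text: True`, so the function receives and returns a string):
  #
  #   # mytransforms.py
  #   def normalize(text):
  #       # Trim surrounding whitespace and lowercase the extracted text.
  #       return text.strip().lower()
  #
  # referenced here as:
  #transform: mytransforms.normalize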
# Links section
###############
# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
next_link:
  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page
  # Filter by the `href` attribute of the anchor tag.
  # Example (a regular expression, so it assumes `regex_mode: True`):
  #href: '^/wiki/\w+'
  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener
  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function
  # Get at most this many links to other pages from one page.
  # Example:
  #limit: 10
  # If `True` and a limit is set that is below the number of matching links on a page,
  # the links are chosen at random; otherwise the first `limit` links are chosen.
  random: True
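
# For reference, a hypothetical complete job tying these settings together: it scrapes
# every <h1> heading from one entry page and from up to five '/wiki/' links per page,
# one level deep (all values are illustrative, not defaults):
#
#   entry_urls: https://en.wikipedia.org/wiki/Web_scraping
#   regex_mode: True
#   max_depth: 1
#   target:
#     tag: h1
#   next_link:
#     href: '^/wiki/\w+'
#     limit: 5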