############################
# Scrape job configuration #
############################

# All possible config parameters are explained here, each with either its default or an example value.

# General section
#################

# Pages to start scraping from. No value is set in this file (explicit `null`).
# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
entry_urls: null

# If `true`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
# regular expressions to match the attribute values; otherwise they are simply checked for string equality.
regex_mode: false

# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
max_depth: 0

# Maximum number of pages to visit.
# Example:
#max_pages: 100

# If `true`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
output_with_urls: false

# Output can be produced in either 'yaml', 'json' or 'simple' format.
# The latter will simply print the extracted targets line by line if `output_with_urls` is `false` and
# add the url of the page they were extracted from in an additional line before them if that setting is `true`.
output_format: simple

# Target section
################

# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
# All options below are children of `target` and must stay indented beneath it.
target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
  #tag: h1

  # Filter by text inside the tag
  # Example:
  #text: program

  # Filter by any valid HTML attributes
  # Example:
  #attrs:
  #  id: firstHeading
  #  class:
  #    - foo
  #    - example
  #  role: bar

  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
  # Example:
  #match_func: module.function

  # If this is set to `true` and no matching tags are found on a page, an exception is raised.
  required: false

  # Stop doing requests as soon as possible when this number of matches were extracted.
  # Note that setting this parameter will restrict the number of returned targets to no more than is set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
  #limit: 20

  # If `true`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
  extract_text: true

  # Optional transformation function to apply to every matching target tag.
  # If `extract_text` is `true`, it should take a string as its sole argument and return a string,
  # otherwise it should take a BS4 Tag object as its sole argument and return a string.
  # Example:
  #transform: module.function

# Links section
###############

# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
# All options below are children of `next_link` and must stay indented beneath it.
next_link:
  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page

  # Filter by the `href` attribute of the anchor tag.
  # Example:
  #href: '^/wiki/\w+'

  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener

  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function

  # Get at most this many links to other pages from one page.
  # Example:
  #limit: 10

  # If `true`, and a limit is set that is below the number of matches on one page,
  # the links are chosen at random. Otherwise the first `limit` number are chosen.
  random: true