# soupjobs/src/soupjobs/default.config.yaml

############################
# Scrape job configuration #
############################
# Every config parameter is explained here, each shown with either its default value or an example value.
# General section
#################
# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string in valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be the path to a text file containing one url per line to visit and scrape.
entry_urls:
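# Hypothetical examples of the three accepted forms (urls and file name are placeholders):
#entry_urls:
#  - https://en.wikipedia.org/wiki/Web_scraping
#  - https://en.wikipedia.org/wiki/HTML
# or a single url:
#entry_urls: https://en.wikipedia.org/wiki/Web_scraping
# or a path to a text file with one url per line:
#entry_urls: urls.txt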
# If `True`, all filter checks treat the provided arguments as regular expressions to match
# against an HTML tag's attribute values; otherwise the values are simply checked for string equality.
regex_mode: False
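# Illustration (hypothetical filter): with `regex_mode: True`, an attribute filter such as
# `class: 'foo.*'` is interpreted as a regular expression pattern; with `regex_mode: False`,
# only the literal attribute value 'foo.*' would match.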
# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
max_depth: 0
# Maximum number of pages to visit.
# Example:
#max_pages: 100
# If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
output_with_urls: False
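# Hypothetical mapping output (rendered in 'json' format; see `output_format` below):
#
#   {"https://en.wikipedia.org/wiki/Web_scraping": ["First match", "Second match"]}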
# Output can be produced in 'yaml', 'json' or 'simple' format.
# The 'simple' format prints the extracted targets line by line if `output_with_urls` is `False` and
# adds the url of the page they were extracted from on an additional line before them if that setting is `True`.
output_format: simple
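# Hypothetical 'simple' output for two matches from one page, first with
# `output_with_urls: False`, then with `output_with_urls: True`:
#
#   First match
#   Second match
#
#   https://en.wikipedia.org/wiki/Web_scraping
#   First match
#   Second match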
# Target section
################
# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
  #tag: h1
  # Filter by text inside the tag
  # Example:
  #text: program
  # Filter by any valid HTML attributes
  # Example:
  #attrs:
  #  id: firstHeading
  #  class:
  #    - foo
  #    - example
  #  role: bar
  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
  # Example:
  #match_func: module.function
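  # A minimal sketch of such a function (hypothetical module `myfilters`; only the
  # documented signature is assumed: one BS4 Tag in, a boolean out):
  #
  #   # myfilters.py
  #   def has_id(tag):
  #       # Match only tags that carry an `id` attribute.
  #       return tag.get('id') is not None
  #
  # which would be referenced here as:
  #match_func: myfilters.has_id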
  # If this is set to `True` and no matching tags are found on a page, an exception is raised.
  required: False
  # Stop making requests as soon as possible once this number of matches has been extracted.
  # Note that setting this parameter restricts the number of returned targets to no more than the value set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
  #limit: 20
  # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
  extract_text: True
  # Optional transformation function to apply to every matching target tag.
  # If `extract_text` is `True`, it should take a string as its sole argument and return a string;
  # otherwise it should take a BS4 Tag object as its sole argument and return a string.
  # Example:
  #transform: module.function
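  # A minimal sketch of a transform (hypothetical module `mytransforms`; assumes
  # `extract_text: True`, so the function receives and returns a string):
  #
  #   # mytransforms.py
  #   def normalize(text):
  #       # Trim surrounding whitespace and lowercase the extracted text.
  #       return text.strip().lower()
  #
  # referenced here as:
  #transform: mytransforms.normalize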
# Links section
###############
# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
next_link:
  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page
  # Filter by the `href` attribute of the anchor tag.
  # Example (a regular expression, so it assumes `regex_mode: True`):
  #href: '^/wiki/\w+'
  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener
  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function
  # Get at most this many links to other pages from one page.
  # Example:
  #limit: 10
  # If `True` and a limit is set that is below the number of matching links on a page,
  # the links are chosen at random; otherwise the first `limit` links are chosen.
  random: True
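
# For reference, a hypothetical complete job tying these settings together: it scrapes
# every <h1> heading from one entry page and from up to five '/wiki/' links per page,
# one level deep (all values are illustrative, not defaults):
#
#   entry_urls: https://en.wikipedia.org/wiki/Web_scraping
#   regex_mode: True
#   max_depth: 1
#   target:
#     tag: h1
#   next_link:
#     href: '^/wiki/\w+'
#     limit: 5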