############################
# Scrape job configuration #
############################

# All possible config parameters are explained here, each with either its default
# or an example value. Example values are commented out; remove the leading `#`
# to enable them.

# General section
#################

# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string that has valid url format, the corresponding page is
# visited and scraped; otherwise the string is assumed to be a path to a text file
# with a url on every line to be visited and scraped.
entry_urls:

# If `true`, all filter checks are performed by matching an HTML tag's attributes
# using the provided arguments as regular expressions to match the attribute values;
# otherwise they are simply checked for string equality.
regex_mode: false

# The maximum recursion depth for following matching links to other pages, starting
# from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links
# may be followed, and then for every one of those, the matching links may be
# followed again, but then the job would end.
max_depth: 0

# Maximum number of pages to visit.
# Example:
#max_pages: 100

# If `true`, the output will be a mapping with the visited urls as keys and lists of
# the extracted matches as values; otherwise the output is merely a list of all
# extracted matches.
output_with_urls: false

# Output can be produced in either 'yaml', 'json' or 'simple' format.
# The latter will simply print the extracted targets line by line if
# `output_with_urls` is `false`, and add the url of the page they were extracted
# from in an additional line before them if that setting is `true`.
output_format: simple

# Target section
################

# The following section is used to specify filter criteria for the target HTML tags
# to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
  #tag: h1

  # Filter by text inside the tag
  # Example:
  #text: program

  # Filter by any valid HTML attributes
  # Example:
  #attrs:
  #  id: firstHeading
  #  class:
  #    - foo
  #    - example
  #  role: bar

  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True`
  # if the tag matches.
  # Example:
  #match_func: module.function

  # If this is set to `true` and no matching tags are found on a page, an exception
  # is raised.
  required: false

  # Stop doing requests as soon as possible once this number of matches has been
  # extracted.
  # Note that setting this parameter will restrict the number of returned targets to
  # no more than is set here, but in asynchronous execution the total number of
  # requests made and targets scraped may be higher.
  # Example:
  #limit: 20

  # If `true`, each matching target tag's text content is extracted; otherwise the
  # entire tag is extracted.
  extract_text: true

  # Optional transformation function to apply to every matching target tag.
  # If `extract_text` is `true`, it should take a string as its sole argument and
  # return a string; otherwise it should take a BS4 Tag object as its sole argument
  # and return a string.
  # Example:
  #transform: module.function

# Links section
###############

# This section is used to specify filter criteria for links (<a> tags) to pick for
# recursive scraping.
# Only HTML <a> tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the
# `max_depth` parameter.
next_link:
  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page

  # Filter by the `href` attribute of the anchor tag.
  # Example:
  #href: '^/wiki/\w+'

  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener

  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function

  # Get at most this many links to other pages from one page.
  # Example:
  #limit: 10

  # If `true`, and a limit is set that is below the number of matches on one page,
  # the links are chosen at random. Otherwise the first `limit` number are chosen.
  random: true