############################
# Scrape job configuration #
############################

# All possible config parameters are explained here, with either the default or an example value provided for them.

# General section
#################

# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
entry_urls:

# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
# regular expressions to match the attribute values; otherwise they are simply checked for string equality.
regex_mode: False

# The maximum recursion depth for following matching links to other pages, starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
max_depth: 0

# Maximum number of pages to visit.
# Example:
#max_pages: 100

# If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
output_with_urls: False

# Output can be produced in 'yaml', 'json' or 'simple' format.
# The latter will simply print the extracted targets line by line if `output_with_urls` is `False`, and
# add the url of the page they were extracted from in an additional line before them if that setting is `True`.
output_format: simple

# Target section
################

# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:

  # Filter by HTML tag
  # Example to only look for <a> tags:
  #tag: a
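
# Complete example
##################

# A minimal sketch of a full job, commented out for illustration. The urls are hypothetical
# placeholders, and the `tag` filter key under `target` is assumed from the example above.
# This job would visit two entry pages, follow matching links at most one level deep, stop
# after 50 pages in total, and output every extracted `a` tag in a mapping under the url of
# the page it was found on:
#
#entry_urls:
#  - https://example.com/news
#  - https://example.com/blog
#max_depth: 1
#max_pages: 50
#output_with_urls: True
#output_format: yaml
#target:
#  tag: a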