diff --git a/.gitignore b/.gitignore index fe4ca33..605b04b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +config.yaml # Python virtual environment: /.venv/ # pipenv lock-file and cache directory: diff --git a/Pipfile b/Pipfile index f2797b1..55ad154 100644 --- a/Pipfile +++ b/Pipfile @@ -6,6 +6,8 @@ name = "pypi" [packages] beautifulsoup4 = "*" aiohttp = "*" +pydantic = "*" +pyyaml = "*" [dev-packages] diff --git a/setup.py b/setup.py index 1414157..67fde8e 100644 --- a/setup.py +++ b/setup.py @@ -14,9 +14,12 @@ setuptools.setup( # url="https://github.com/...", package_dir={'': 'src'}, packages=setuptools.find_packages(where='src'), + package_data={'soupjobs': ['example.config.yaml']}, install_requires=[ 'aiohttp', - 'beautifulsoup4' + 'beautifulsoup4', + 'pydantic', + 'pyyaml', ], extras_require={ 'tests': ['coverage'], diff --git a/src/soupjobs/__init__.py b/src/soupjobs/__init__.py new file mode 100644 index 0000000..9c9ee07 --- /dev/null +++ b/src/soupjobs/__init__.py @@ -0,0 +1,2 @@ +CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG' +CONFIG_FILE_PLACEHOLDER = 'placeholder' diff --git a/src/soupjobs/example.config.yaml b/src/soupjobs/example.config.yaml new file mode 100644 index 0000000..deb3774 --- /dev/null +++ b/src/soupjobs/example.config.yaml @@ -0,0 +1,124 @@ +############################ +# Scrape job configuration # +############################ + +# All possible config parameters are explained here with either the default or an example value provided with them. + +# General section +################# + +# If specified as a list, the elements are interpreted as urls to visit and scrape; +# if specified as a string that has valid url format, the corresponding page is visited and scraped; +# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped. +entry_urls: + - https://en.wikipedia.org/wiki/Python_(programming_language) + - https://en.wikipedia.org/wiki/Guido_van_Rossum + +# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as +# regular expressions to match the attribute values, otherwise they are simply checked for string equality. +# Default: +#regex_mode: False +regex_mode: True + +# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`. +# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed, +# and then for every one of those, the matching links may be followed again, but then the job would end. +# Default: +#max_depth: 0 +max_depth: 2 + +# Maximum number of pages to visit. +# Example: +#max_pages: 100 + +# If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values; +# otherwise the output is merely a list of all extracted matches. +# Default: +#output_with_urls: False + +# Output can be produced in either 'yaml', 'json' or 'simple' format. +# The latter will simply print the extracted targets line by line if `output_with_urls` is `False` and +# add the url of the page they were extracted from in an additional line before them if that setting is `True`. +# Default: +#output_format: simple + + +# Target section +# ############## + +# The following section is used to specify filter criteria for the target HTML tags to extract from a page. +# Only HTML tags matching *all* of the specified filters will be extracted. +target: + # Filter by HTML tag + # Example to only look for
<h1>
tags: + tag: h1 + + # Filter by text inside the tag + # Example: + #text: program + + # Filter by any valid HTML attributes + attrs: + # Examples: + id: firstHeading + #class: foo + #role: bar + + # Filter using a custom python function with the path specified in dot-notation. + # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches. + # Example: + #match_func: module.function + + # If this is set to `True` and no matching tags are found on a page, an exception is raised. + # Default: + #required: False + + # Stop doing requests as soon as possible when this number of matches were extracted. + # Note that setting this parameter will restrict the number of returned targets to no more than is set here, + # but in asynchronous execution the total number of requests made and targets scraped may be higher. + # Example: + limit: 20 + + # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted. + # Default: + #extract_text: True + + # Optional transformation function to apply to every matching target tag. + # Should take a BS4 Tag object as its sole argument and return a string. + # Example: + #transform: module.function + + +# Links section +############### + +# This section is used to specify filter criteria for links ( tags) to pick for recursive scraping. +# Only HTML tags matching *all* of the specified filters will be considered. +# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter. +next_link: + # Filter by text inside the anchor tag + # Example: + #text: Another important page + + # Filter by the `href` attribute of the anchor tag. + # Example: + href: '^/wiki/\w+' + + # Filter by any other valid HTML attributes + # Example: + #attrs: + #class: result + #rel: noopener + + # Function filter; same as in the `target` section. + # Example: + #match_func: module.function + + # Get at most this many links to other pages from one page. + # Example: + limit: 10 + + # If `True`, and a limit is set that is below the number of matches on one page, + # the links are chosen at random. Otherwise the first `limit` number are chosen. + # Default: + #random: True diff --git a/src/soupjobs/run.py b/src/soupjobs/run.py new file mode 100644 index 0000000..ebc9100 --- /dev/null +++ b/src/soupjobs/run.py @@ -0,0 +1,71 @@ +from argparse import ArgumentParser, SUPPRESS +from pathlib import Path +from distutils.util import strtobool +import asyncio +import os + +from soupjobs import CONFIG_FILE_ENV_VAR + + +if __name__ == '__main__': + parser = ArgumentParser( + argument_default=SUPPRESS, + description="CLI tool for starting scraping jobs. " + "The provided settings always override environment variables and those in the config file. " + "For more detailed descriptions of options refer to the example config file." + ) + parser.add_argument( + '-c', '--config-file', + type=Path, + default=Path(Path(__file__).parent, 'config.yaml'), + help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script." 
+ ) + parser.add_argument( + 'entry_urls', + type=str, + nargs='*', + help="Can be a url, list of urls or path to a text file with urls" + ) + parser.add_argument( + '-r', '--regex-mode', + action='store_true', + help="Set this flag to treat all filter strings as regular expressions" + ) + parser.add_argument( + '-d', '--max-depth', + type=int, + help="Maximum recursion depth for following matching links to other pages" + ) + parser.add_argument( + '-p', '--max-pages', + type=int, + help="Maximum number of pages to visit" + ) + parser.add_argument( + '--output-with-urls', + action='store_true', + help="Set this flag to map scraped results to the url of the page they were found on" + ) + parser.add_argument( + '-o', '--output-format', + type=str, + help="Set to either 'yaml', 'json' or 'simple' format" + ) + kwargs = vars(parser.parse_args()) + os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file')) + from soupjobs.settings import settings + from soupjobs.scrape import Job + for key, value in kwargs.items(): + print(key, value, type(value)) + setattr(settings, key, value) + if not settings.entry_urls: + print("No urls specified") + exit() + if not settings.has_target_filters(): + warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \ + "Are you sure you want to proceed? (y/n)" + proceed = strtobool(input(warning)) + if not proceed: + print("Cancelled") + exit() + asyncio.run(Job().start()) diff --git a/src/soupjobs/scrape.py b/src/soupjobs/scrape.py new file mode 100644 index 0000000..4a1d705 --- /dev/null +++ b/src/soupjobs/scrape.py @@ -0,0 +1,209 @@ +from typing import Tuple, List, Dict, Callable, Any +from urllib.parse import urlparse, urljoin +import asyncio +import random +import re +import json + +from aiohttp import ClientSession +from bs4 import BeautifulSoup +from bs4.element import Tag +import yaml + +from soupjobs.settings import settings, OutputFormat, TagAttrs + + +BS_PARSER = 'html.parser' + + +class Job: + def __init__(self, session: ClientSession = None): + self.loop = None + self.session = session + self.lock = None + self.results: Dict[str, List[Any]] = {} + self.num_targets_found: int = 0 + self.num_links_followed: int = 0 + self.page_counter: int = 0 + self.entry_urls: List[str] = [] + urls = [settings.entry_urls] if isinstance(settings.entry_urls, str) else settings.entry_urls + for url in urls: + if is_valid_url(url): + self.entry_urls.append(url) + else: + with open(url, 'r') as f: + self.entry_urls += f.readlines() + + async def init_async(self) -> None: + self.loop = asyncio.get_event_loop() + self.session = ClientSession(loop=self.loop) + self.lock = asyncio.Lock() + + async def start(self) -> None: + await self.init_async() + print_start() + try: + await self.run(*self.entry_urls) + finally: + await self.session.close() + print_end() + + async def run(self, *urls: str, depth: int = 0) -> None: + async with self.lock: + urls = list(set(urls).difference(*self.results.keys())) + if settings.max_pages: + num_requests_left = settings.max_pages - self.num_links_followed + if num_requests_left <= 0: + return + urls = urls[:num_requests_left] + self.num_links_followed += len(urls) + output = await asyncio.gather(*(self.get_and_scrape(url) for url in urls), loop=self.loop) + assert isinstance(output, list) + next_links = construct_next_urls(urls, output) + if depth < settings.max_depth: + await self.run(*next_links, depth=depth + 1) + + async def get_and_scrape(self, url: str) -> List[str]: + async with self.lock: + 
if settings.target_limit and self.num_targets_found >= settings.target_limit: + return [] + async with self.session.get(url) as response: + html = await response.text() + targets, links = scrape_document(html) + async with self.lock: + self.page_counter += 1 + num_targets_left = settings.target_limit - self.num_targets_found if settings.target_limit else None + targets = targets[:num_targets_left] + self.results[url] = targets + self.num_targets_found += len(targets) + print_page_results(url, targets, self.limit_reached()) + return links + + def limit_reached(self) -> bool: + if settings.max_pages and self.page_counter >= settings.max_pages: + return True + if settings.target_limit and self.num_targets_found >= settings.target_limit: + return True + return False + + +def scrape_document(html: str) -> Tuple[List[Any], List[str]]: + soup = BeautifulSoup(html, BS_PARSER) + # Targets: + targets = [] + for tag in soup.find_all(target_filter, limit=settings.target_limit): + target = tag.text if settings.target_extract_text else str(tag) + if settings.target_transform: + target = settings.target_transform(target) + targets.append(target) + # Next links: + if settings.next_link_random: + links = soup.find_all(link_filter) + if settings.next_link_limit and settings.next_link_limit < len(links): + indices = list(range(len(links))) + random.shuffle(indices) + links = links[:settings.next_link_limit] + else: + links = soup.find_all(link_filter, limit=settings.next_link_limit) + return targets, [a['href'] for a in links] + + +def link_filter(tag: Tag) -> bool: + try: + if not string_matches(tag['href'], settings.next_link_href, settings.regex_mode): + return False + except KeyError: + return False + return tag_filter(tag, 'a', text=settings.next_link_text, attrs=settings.next_link_attrs, + func=settings.next_link_match_func, regex=settings.regex_mode) + + +def target_filter(tag: Tag) -> bool: + return tag_filter(tag, settings.target_tag, text=settings.target_text, attrs=settings.target_attrs, + func=settings.target_match_func, regex=settings.regex_mode) + + +def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = None, func: Callable[[Tag], bool] = None, + regex: bool = False) -> bool: + """ + Returns `True` only if the `tag` matches all provided filter criteria. + Built to be used in the `find_all` method from BeautifulSoup4. + + Args: + tag: + The BS4 Tag object to check + name (optional): + What kind of tag will be matched (e.g. 'a' would match an HTML anchor tag) + text (optional): + The text enclosed by the tag to be matched + func (optional): + Function to run on the tag for filtering (should return `True` if the tag matches) + attrs (optional): + Any additional attributes the tag should match, e.g. {'class': 'relevant'} + regex (optional): + If `True`, all checks are performed by matching the tag's attributes using the provided arguments as + a regular expression, otherwise they are checked for string equality. 
+ """ + if not string_matches(tag.name, name, regex): + return False + if not string_matches(tag.text, text, regex): + return False + for attr_name, attr_value in attrs.items(): + try: + if not string_matches(tag[attr_name], attr_value, regex): + return False + except KeyError: + return False + if func: + return func(tag) + return True + + +def string_matches(search_string: str, expression: str = None, regex: bool = False) -> bool: + if expression is None: + return True + if not regex: + return search_string == expression + return re.compile(expression).search(search_string) is not None + + +def construct_next_urls(urls: List[str], next_links_lists: List[List[str]]) -> List[str]: + output = set() + for url, next_links in zip(urls, next_links_lists): + for link in next_links: + output.add(urljoin(url, link)) + return list(output) + + +def is_valid_url(string: str) -> bool: + parsed = urlparse(string) + if not all([parsed.scheme, parsed.netloc]): + return False + if parsed.scheme not in ('http', 'https'): + return False + return True + + +def print_start() -> None: + if settings.output_format == OutputFormat.json: + print('{') if settings.output_with_urls else print('[') + + +def print_end() -> None: + if settings.output_format == OutputFormat.json: + print('}') if settings.output_with_urls else print(']') + + +def print_page_results(url: str, targets: List[str], last: bool = False) -> None: + if settings.output_format == OutputFormat.yaml: + output = yaml.safe_dump({url: targets} if settings.output_with_urls else targets) + end = '' + elif settings.output_format == OutputFormat.json: + output = json.dumps({url: targets} if settings.output_with_urls else targets, indent=2)[2:-2] + end = '\n' if last else ',\n' + else: + output = '\n'.join(targets) + if settings.output_with_urls: + output = url + ':\n' + output + end = '\n' + print(output, end=end) diff --git a/src/soupjobs/settings.py b/src/soupjobs/settings.py new file mode 100644 index 0000000..8699fe6 --- /dev/null +++ b/src/soupjobs/settings.py @@ -0,0 +1,74 @@ +from typing import Dict, List, Union, Optional, Any +from pathlib import Path +from enum import Enum +import os + +from pydantic import BaseSettings, PyObject +import yaml + +from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER + + +TagAttrs = Dict[str, str] + + +class OutputFormat(str, Enum): + simple = 'simple' + yaml = 'yaml' + json = 'json' + + +class Settings(BaseSettings): + _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER) + + entry_urls: Union[str, List[str]] = [] + regex_mode: bool = False + max_depth: int = 0 + max_pages: Optional[int] + output_with_urls: bool = False + output_format: OutputFormat = OutputFormat.simple + + # Target to extract: + target_tag: Optional[str] + target_text: Optional[str] + target_attrs: TagAttrs = {} + target_match_func: Optional[PyObject] + target_required: bool = False + target_limit: Optional[int] + target_extract_text: bool = True + target_transform: Optional[PyObject] + + # Link to next page: + next_link_text: Optional[str] + next_link_href: Optional[str] + next_link_attrs: TagAttrs = {} + next_link_match_func: Optional[PyObject] + next_link_limit: Optional[int] + next_link_random: bool = True + + def has_target_filters(self) -> bool: + return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func]) + + @staticmethod + def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]: + try: + with open(Settings._CONFIG_FILE, 'r') as f: + config = 
yaml.safe_load(f) + except FileNotFoundError: + return {} + for section_name in ('target', 'next_link'): + section = config.pop(section_name, {}) + for key, value in section.items(): + config.setdefault(f'{section_name}_{key}', value) + return config + + class Config: + validate_assignment = True + env_file_encoding = 'utf-8' + + @classmethod + def customise_sources(cls, init_settings, env_settings, file_secret_settings): + return init_settings, env_settings, file_secret_settings, Settings.yaml_settings + + +settings = Settings()
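
Note (not part of the diff): the `yaml_settings` source in `settings.py` flattens the nested `target` and `next_link` sections of the YAML file into the prefixed field names declared on `Settings` (e.g. `target.tag` becomes `target_tag`). A minimal standalone sketch of that mapping, using keys taken from `example.config.yaml`:

import yaml

# Mirrors the flattening loop in Settings.yaml_settings, outside of pydantic.
raw = yaml.safe_load(r'''
regex_mode: true
max_depth: 2
target:
  tag: h1
  attrs:
    id: firstHeading
  limit: 20
next_link:
  href: '^/wiki/\w+'
  limit: 10
''')

for section_name in ('target', 'next_link'):
    section = raw.pop(section_name, {})
    for key, value in section.items():
        raw.setdefault(f'{section_name}_{key}', value)

# raw now holds flat keys such as 'target_tag', 'target_attrs' and 'next_link_href',
# which match the field names declared on the Settings class.
print(raw)

With a config like this saved as `config.yaml` next to `run.py` (the default `-c` location, and the pattern added to `.gitignore` in this diff), a job should be startable with something like `python -m soupjobs.run`, with any CLI flags overriding the file values as described in the parser help.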
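
Also for reference: `match_func` and the other `PyObject` settings are imported by pydantic from the dotted path given in the config, so the referenced module has to be importable wherever the job runs. A hypothetical filter module sketch (the `my_filters` name and `has_id` function are made up for illustration), following the contract described in `example.config.yaml` — the function receives each candidate BS4 Tag and returns `True` on a match:

from bs4.element import Tag

# Hypothetical module `my_filters`; referenced from the config as `match_func: my_filters.has_id`.
def has_id(tag: Tag) -> bool:
    # Match only tags that carry an `id` attribute.
    return tag.get('id') is not None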