first working draft

Daniil Fajnberg 2021-03-13 16:53:15 +01:00
parent f58ed9fee0
commit 65f8c7d4be
8 changed files with 487 additions and 1 deletion

.gitignore (vendored, 1 change)

@@ -1,3 +1,4 @@
config.yaml
# Python virtual environment:
/.venv/
# pipenv lock-file and cache directory:

Pipfile

@@ -6,6 +6,8 @@ name = "pypi"
[packages]
beautifulsoup4 = "*"
aiohttp = "*"
pydantic = "*"
pyyaml = "*"
[dev-packages]

setup.py

@@ -14,9 +14,12 @@ setuptools.setup(
# url="https://github.com/...",
package_dir={'': 'src'},
packages=setuptools.find_packages(where='src'),
package_data={'soupjobs': ['example.config.yaml']},
install_requires=[
'aiohttp',
'beautifulsoup4'
'beautifulsoup4',
'pydantic',
'pyyaml',
],
extras_require={
'tests': ['coverage'],

src/soupjobs/__init__.py (new file, 2 lines)

@@ -0,0 +1,2 @@
CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
CONFIG_FILE_PLACEHOLDER = 'placeholder'

src/soupjobs/example.config.yaml (new file)

@@ -0,0 +1,124 @@
############################
# Scrape job configuration #
############################
# Every available config parameter is explained here, each with either its default or an example value.
# General section
#################
# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be the path to a text file containing one url per line to visit and scrape.
entry_urls:
- https://en.wikipedia.org/wiki/Python_(programming_language)
- https://en.wikipedia.org/wiki/Guido_van_Rossum
# If `True`, all filter checks are performed by treating the provided arguments as regular expressions
# to match against an HTML tag's attribute values; otherwise the values are simply checked for string equality.
# Default:
#regex_mode: False
regex_mode: True
# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that matching links may be followed from every entry page,
# and from each of those pages matching links may be followed once more, after which the job ends.
# Default:
#max_depth: 0
max_depth: 2
# Maximum number of pages to visit.
# Example:
#max_pages: 100
# If `True` the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
# Default:
#output_with_urls: False
# Output can be produced in 'yaml', 'json' or 'simple' format.
# The 'simple' format prints the extracted targets line by line if `output_with_urls` is `False`,
# and precedes them with an additional line containing the source page's url if it is `True`.
# Default:
#output_format: simple
# Target section
################
# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:
# Filter by HTML tag
# Example to only look for <h1> tags:
tag: h1
# Filter by text inside the tag
# Example:
#text: program
# Filter by any valid HTML attributes
attrs:
# Examples:
id: firstHeading
#class: foo
#role: bar
# Filter using a custom python function with the path specified in dot-notation.
# The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
# Example:
#match_func: module.function
# If this is set to `True` and no matching tags are found on a page, an exception is raised.
# Default:
#required: False
# Stop making requests as soon as possible once this number of matches has been extracted.
# Note that this restricts the number of returned targets to at most the value set here,
# but in asynchronous execution the total number of requests made and targets scraped may be higher.
# Example:
limit: 20
# If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
# Default:
#extract_text: True
# Optional transformation function to apply to every extracted target before it is collected.
# It is called with the extracted value (the tag's text, or the whole tag as a string if `extract_text` is `False`)
# and should return a string.
# Example:
#transform: module.function
# Links section
###############
# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
next_link:
# Filter by text inside the anchor tag
# Example:
#text: Another important page
# Filter by the `href` attribute of the anchor tag.
# Example:
href: '^/wiki/\w+'
# Filter by any other valid HTML attributes
# Example:
#attrs:
#class: result
#rel: noopener
# Function filter; same as in the `target` section.
# Example:
#match_func: module.function
# Get at most this many links to other pages from one page.
# Example:
limit: 10
# If `True` and a limit is set that is below the number of matching links on a page,
# the links are chosen at random; otherwise the first `limit` links are chosen.
# Default:
#random: True
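
The `match_func` and `transform` hooks above are referenced by dotted path, so they can live in any importable module. A minimal sketch of such a module (the module name `myhooks` and both function names are hypothetical; only the call contracts come from the comments above):

# myhooks.py -- hypothetical module, referenced in the config as
#   match_func: myhooks.is_python_heading
#   transform: myhooks.clean_text
from bs4.element import Tag


def is_python_heading(tag: Tag) -> bool:
    # Custom filter: called with the candidate BS4 Tag, returns True if it matches.
    return tag.name == 'h1' and 'Python' in tag.get_text()


def clean_text(value: str) -> str:
    # Custom transform: called with the extracted value, returns the string to output.
    return ' '.join(value.split())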

src/soupjobs/run.py (new file, 71 lines)

@@ -0,0 +1,71 @@
from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
from distutils.util import strtobool
import asyncio
import os
from soupjobs import CONFIG_FILE_ENV_VAR
if __name__ == '__main__':
parser = ArgumentParser(
argument_default=SUPPRESS,
description="CLI tool for starting scraping jobs. "
"The provided settings always override environment variables and those in the config file. "
"For more detailed descriptions of options refer to the example config file."
)
parser.add_argument(
'-c', '--config-file',
type=Path,
default=Path(Path(__file__).parent, 'config.yaml'),
help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
)
parser.add_argument(
'entry_urls',
type=str,
nargs='*',
help="Can be a url, list of urls or path to a text file with urls"
)
parser.add_argument(
'-r', '--regex-mode',
action='store_true',
help="Set this flag to treat all filter strings as regular expressions"
)
parser.add_argument(
'-d', '--max-depth',
type=int,
help="Maximum recursion depth for following matching links to other pages"
)
parser.add_argument(
'-p', '--max-pages',
type=int,
help="Maximum number of pages to visit"
)
parser.add_argument(
'--output-with-urls',
action='store_true',
help="Set this flag to map scraped results to the url of the page they were found on"
)
parser.add_argument(
'-o', '--output-format',
type=str,
help="Set to either 'yaml', 'json' or 'simple' format"
)
kwargs = vars(parser.parse_args())
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
    # Import these only after the config file env var has been set, so the chosen file is picked up
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
for key, value in kwargs.items():
print(key, value, type(value))
setattr(settings, key, value)
if not settings.entry_urls:
print("No urls specified")
exit()
if not settings.has_target_filters():
warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
"Are you sure you want to proceed? (y/n)"
proceed = strtobool(input(warning))
if not proceed:
print("Cancelled")
exit()
asyncio.run(Job().start())
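
The override order the script relies on can also be reproduced programmatically; a sketch (the config path is a placeholder, and the attribute overrides simply mirror the CLI flags above):

import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR

# Point the settings loader at a config file *before* importing the settings module,
# just as run.py does with the parsed --config-file argument.
os.environ[CONFIG_FILE_ENV_VAR] = '/path/to/config.yaml'  # placeholder path

from soupjobs.settings import settings
from soupjobs.scrape import Job

# Attribute assignment is validated because Settings sets `validate_assignment = True`.
settings.entry_urls = ['https://en.wikipedia.org/wiki/Python_(programming_language)']
settings.max_depth = 1
settings.output_format = 'json'

asyncio.run(Job().start())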

src/soupjobs/scrape.py (new file, 209 lines)

@@ -0,0 +1,209 @@
from typing import Tuple, List, Dict, Callable, Any
from urllib.parse import urlparse, urljoin
import asyncio
import random
import re
import json
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
import yaml
from soupjobs.settings import settings, OutputFormat, TagAttrs
BS_PARSER = 'html.parser'
class Job:
def __init__(self, session: ClientSession = None):
self.loop = None
self.session = session
self.lock = None
self.results: Dict[str, List[Any]] = {}
self.num_targets_found: int = 0
self.num_links_followed: int = 0
self.page_counter: int = 0
self.entry_urls: List[str] = []
urls = [settings.entry_urls] if isinstance(settings.entry_urls, str) else settings.entry_urls
for url in urls:
if is_valid_url(url):
self.entry_urls.append(url)
else:
                with open(url, 'r') as f:
                    # Strip trailing newlines and skip blank lines
                    self.entry_urls += [line.strip() for line in f if line.strip()]
async def init_async(self) -> None:
self.loop = asyncio.get_event_loop()
self.session = ClientSession(loop=self.loop)
self.lock = asyncio.Lock()
async def start(self) -> None:
await self.init_async()
print_start()
try:
await self.run(*self.entry_urls)
finally:
await self.session.close()
print_end()
async def run(self, *urls: str, depth: int = 0) -> None:
async with self.lock:
            urls = list(set(urls).difference(self.results.keys()))  # skip urls that were already scraped
if settings.max_pages:
num_requests_left = settings.max_pages - self.num_links_followed
if num_requests_left <= 0:
return
urls = urls[:num_requests_left]
self.num_links_followed += len(urls)
output = await asyncio.gather(*(self.get_and_scrape(url) for url in urls), loop=self.loop)
assert isinstance(output, list)
next_links = construct_next_urls(urls, output)
if depth < settings.max_depth:
await self.run(*next_links, depth=depth + 1)
async def get_and_scrape(self, url: str) -> List[str]:
async with self.lock:
if settings.target_limit and self.num_targets_found >= settings.target_limit:
return []
async with self.session.get(url) as response:
html = await response.text()
targets, links = scrape_document(html)
async with self.lock:
self.page_counter += 1
num_targets_left = settings.target_limit - self.num_targets_found if settings.target_limit else None
targets = targets[:num_targets_left]
self.results[url] = targets
self.num_targets_found += len(targets)
print_page_results(url, targets, self.limit_reached())
return links
def limit_reached(self) -> bool:
if settings.max_pages and self.page_counter >= settings.max_pages:
return True
if settings.target_limit and self.num_targets_found >= settings.target_limit:
return True
return False
def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
soup = BeautifulSoup(html, BS_PARSER)
# Targets:
targets = []
for tag in soup.find_all(target_filter, limit=settings.target_limit):
target = tag.text if settings.target_extract_text else str(tag)
if settings.target_transform:
target = settings.target_transform(target)
targets.append(target)
# Next links:
    if settings.next_link_random:
        links = soup.find_all(link_filter)
        if settings.next_link_limit and settings.next_link_limit < len(links):
            # Choose a random subset of the matching links
            links = random.sample(links, settings.next_link_limit)
else:
links = soup.find_all(link_filter, limit=settings.next_link_limit)
return targets, [a['href'] for a in links]
def link_filter(tag: Tag) -> bool:
try:
if not string_matches(tag['href'], settings.next_link_href, settings.regex_mode):
return False
except KeyError:
return False
return tag_filter(tag, 'a', text=settings.next_link_text, attrs=settings.next_link_attrs,
func=settings.next_link_match_func, regex=settings.regex_mode)
def target_filter(tag: Tag) -> bool:
return tag_filter(tag, settings.target_tag, text=settings.target_text, attrs=settings.target_attrs,
func=settings.target_match_func, regex=settings.regex_mode)
def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = None, func: Callable[[Tag], bool] = None,
regex: bool = False) -> bool:
"""
Returns `True` only if the `tag` matches all provided filter criteria.
Built to be used in the `find_all` method from BeautifulSoup4.
Args:
tag:
The BS4 Tag object to check
name (optional):
What kind of tag will be matched (e.g. 'a' would match an HTML anchor tag)
text (optional):
The text enclosed by the tag to be matched
func (optional):
Function to run on the tag for filtering (should return `True` if the tag matches)
attrs (optional):
Any additional attributes the tag should match, e.g. {'class': 'relevant'}
regex (optional):
If `True`, all checks are performed by matching the tag's attributes using the provided arguments as
a regular expression, otherwise they are checked for string equality.
"""
if not string_matches(tag.name, name, regex):
return False
if not string_matches(tag.text, text, regex):
return False
    for attr_name, attr_value in (attrs or {}).items():  # `attrs` may be omitted
try:
if not string_matches(tag[attr_name], attr_value, regex):
return False
except KeyError:
return False
if func:
return func(tag)
return True
def string_matches(search_string: str, expression: str = None, regex: bool = False) -> bool:
if expression is None:
return True
if not regex:
return search_string == expression
return re.compile(expression).search(search_string) is not None
def construct_next_urls(urls: List[str], next_links_lists: List[List[str]]) -> List[str]:
output = set()
for url, next_links in zip(urls, next_links_lists):
for link in next_links:
output.add(urljoin(url, link))
return list(output)
def is_valid_url(string: str) -> bool:
parsed = urlparse(string)
if not all([parsed.scheme, parsed.netloc]):
return False
if parsed.scheme not in ('http', 'https'):
return False
return True
def print_start() -> None:
if settings.output_format == OutputFormat.json:
print('{') if settings.output_with_urls else print('[')
def print_end() -> None:
if settings.output_format == OutputFormat.json:
print('}') if settings.output_with_urls else print(']')
def print_page_results(url: str, targets: List[str], last: bool = False) -> None:
if settings.output_format == OutputFormat.yaml:
output = yaml.safe_dump({url: targets} if settings.output_with_urls else targets)
end = ''
elif settings.output_format == OutputFormat.json:
output = json.dumps({url: targets} if settings.output_with_urls else targets, indent=2)[2:-2]
end = '\n' if last else ',\n'
else:
output = '\n'.join(targets)
if settings.output_with_urls:
output = url + ':\n' + output
end = '\n'
print(output, end=end)
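
The two filter helpers can also be exercised in isolation, which is handy for checking a config before a full run; a small sketch (the HTML snippet and expected results are illustrative):

from bs4 import BeautifulSoup

from soupjobs.scrape import string_matches, tag_filter

html = '<h1 id="firstHeading">Python (programming language)</h1>'
tag = BeautifulSoup(html, 'html.parser').h1

# Plain string-equality mode (regex_mode: False):
print(tag_filter(tag, name='h1', attrs={'id': 'firstHeading'}))   # True
# Regex mode, as used when regex_mode is True:
print(string_matches(tag['id'], r'^first\w+', regex=True))        # True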

src/soupjobs/settings.py (new file, 74 lines)

@@ -0,0 +1,74 @@
from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os
from pydantic import BaseSettings, PyObject
import yaml
from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER
TagAttrs = Dict[str, str]
class OutputFormat(str, Enum):
simple = 'simple'
yaml = 'yaml'
json = 'json'
class Settings(BaseSettings):
_CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)
entry_urls: Union[str, List[str]] = []
regex_mode: bool = False
max_depth: int = 0
max_pages: Optional[int]
output_with_urls: bool = False
output_format: OutputFormat = OutputFormat.simple
# Target to extract:
target_tag: Optional[str]
target_text: Optional[str]
target_attrs: TagAttrs = {}
target_match_func: Optional[PyObject]
target_required: bool = False
target_limit: Optional[int]
target_extract_text: bool = True
target_transform: Optional[PyObject]
# Link to next page:
next_link_text: Optional[str]
next_link_href: Optional[str]
next_link_attrs: TagAttrs = {}
next_link_match_func: Optional[PyObject]
next_link_limit: Optional[int]
next_link_random: bool = True
def has_target_filters(self) -> bool:
return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])
@staticmethod
def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
try:
with open(Settings._CONFIG_FILE, 'r') as f:
                config = yaml.safe_load(f) or {}  # an empty config file means no overrides
except FileNotFoundError:
return {}
for section_name in ('target', 'next_link'):
section = config.pop(section_name, {})
for key, value in section.items():
config.setdefault(f'{section_name}_{key}', value)
return config
class Config:
validate_assignment = True
env_file_encoding = 'utf-8'
@classmethod
def customise_sources(cls, init_settings, env_settings, file_secret_settings):
return init_settings, env_settings, file_secret_settings, Settings.yaml_settings
settings = Settings()
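
The `yaml_settings` source flattens the nested `target` and `next_link` sections of the config into the flat field names defined above; the mapping can be seen with a few standalone lines (the inline YAML is illustrative):

import yaml

config = yaml.safe_load("""
max_depth: 2
target:
  tag: h1
  attrs:
    id: firstHeading
next_link:
  limit: 10
  random: false
""")

# Same flattening loop as in Settings.yaml_settings:
for section_name in ('target', 'next_link'):
    section = config.pop(section_name, {})
    for key, value in section.items():
        config.setdefault(f'{section_name}_{key}', value)

print(config)
# {'max_depth': 2, 'target_tag': 'h1', 'target_attrs': {'id': 'firstHeading'},
#  'next_link_limit': 10, 'next_link_random': False}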