first working draft
commit 65f8c7d4be (parent f58ed9fee0)
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
|||||||
|
config.yaml
|
||||||
# Python virtual environment:
|
# Python virtual environment:
|
||||||
/.venv/
|
/.venv/
|
||||||
# pipenv lock-file and cache directory:
|
# pipenv lock-file and cache directory:
|
||||||
|
Pipfile: 2 additions

@@ -6,6 +6,8 @@ name = "pypi"
 [packages]
 beautifulsoup4 = "*"
 aiohttp = "*"
+pydantic = "*"
+pyyaml = "*"

 [dev-packages]

setup.py: 4 additions, 1 deletion

@@ -14,9 +14,12 @@ setuptools.setup(
     # url="https://github.com/...",
     package_dir={'': 'src'},
     packages=setuptools.find_packages(where='src'),
+    package_data={'soupjobs': ['example.config.yaml']},
     install_requires=[
         'aiohttp',
-        'beautifulsoup4'
+        'beautifulsoup4',
+        'pydantic',
+        'pyyaml',
     ],
     extras_require={
         'tests': ['coverage'],
src/soupjobs/__init__.py (new file, 2 lines)

CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
CONFIG_FILE_PLACEHOLDER = 'placeholder'
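These two constants are how the rest of the package locates its config file. As a minimal sketch of the intended usage (the config path below is only a placeholder), pointing soupjobs at a config programmatically, the same way run.py does, could look like this:

import os

from soupjobs import CONFIG_FILE_ENV_VAR

# soupjobs.settings reads this environment variable when it is first imported,
# so it has to be set before that import happens (run.py does the same thing).
os.environ[CONFIG_FILE_ENV_VAR] = '/path/to/my/config.yaml'

from soupjobs.settings import settings
print(settings.entry_urls)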
src/soupjobs/example.config.yaml (new file, 124 lines)

############################
# Scrape job configuration #
############################

# All possible config parameters are explained here, with either the default or an example value provided for each.


# General section
#################

# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
entry_urls:
  - https://en.wikipedia.org/wiki/Python_(programming_language)
  - https://en.wikipedia.org/wiki/Guido_van_Rossum

# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
# regular expressions to match the attribute values; otherwise they are simply checked for string equality.
# Default:
#regex_mode: False
regex_mode: True

# The maximum recursion depth for following matching links to other pages, starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
# Default:
#max_depth: 0
max_depth: 2

# Maximum number of pages to visit.
# Example:
#max_pages: 100

# If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
# Default:
#output_with_urls: False

# Output can be produced in either 'yaml', 'json' or 'simple' format.
# The latter will simply print the extracted targets line by line if `output_with_urls` is `False` and
# add the url of the page they were extracted from in an additional line before them if that setting is `True`.
# Default:
#output_format: simple


# Target section
################

# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
  tag: h1

  # Filter by text inside the tag
  # Example:
  #text: program

  # Filter by any valid HTML attributes
  attrs:
    # Examples:
    id: firstHeading
    #class: foo
    #role: bar

  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
  # Example:
  #match_func: module.function

  # If this is set to `True` and no matching tags are found on a page, an exception is raised.
  # Default:
  #required: False

  # Stop making requests as soon as possible once this number of matches has been extracted.
  # Note that setting this parameter will restrict the number of returned targets to no more than is set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
  limit: 20

  # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
  # Default:
  #extract_text: True

  # Optional transformation function to apply to every matching target tag.
  # Should take a BS4 Tag object as its sole argument and return a string.
  # Example:
  #transform: module.function


# Links section
###############

# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
next_link:
  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page

  # Filter by the `href` attribute of the anchor tag.
  # Example:
  href: '^/wiki/\w+'

  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener

  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function

  # Get at most this many links to other pages from one page.
  # Example:
  limit: 10

  # If `True`, and a limit is set that is below the number of matches on one page,
  # the links are chosen at random. Otherwise the first `limit` number are chosen.
  # Default:
  #random: True
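The `match_func` options above expect a dotted import path to a callable that takes a BS4 Tag and returns a boolean. A hypothetical filter module (the module and function names here are illustrative, not part of this commit) could look like:

# my_filters.py -- referenced from the config as, e.g., `match_func: my_filters.short_heading`
from bs4.element import Tag


def short_heading(tag: Tag) -> bool:
    # Accept only tags whose text content is reasonably short
    return len(tag.text.strip()) < 80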
src/soupjobs/run.py (new file, 71 lines)

from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
from distutils.util import strtobool
import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR


if __name__ == '__main__':
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path(__file__).parent, 'config.yaml'),
        help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a url, a list of urls, or a path to a text file with urls"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the url of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Set to either 'yaml', 'json' or 'simple' format"
    )
    kwargs = vars(parser.parse_args())
    # The settings module reads the config path from the environment at import time,
    # so it must be set before anything from soupjobs.settings or soupjobs.scrape is imported.
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
    # Only options that were explicitly passed end up in kwargs (because of argument_default=SUPPRESS),
    # so only those override the loaded settings.
    for key, value in kwargs.items():
        print(key, value, type(value))
        setattr(settings, key, value)
    if not settings.entry_urls:
        print("No urls specified")
        exit()
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n)"
        proceed = strtobool(input(warning))
        if not proceed:
            print("Cancelled")
            exit()
    asyncio.run(Job().start())
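The `argument_default=SUPPRESS` choice is what makes the "CLI overrides config" behaviour work: options the user does not pass never appear in the parsed namespace, so the loop over `kwargs` only touches settings that were given explicitly. A small standalone sketch of that effect:

from argparse import ArgumentParser, SUPPRESS

parser = ArgumentParser(argument_default=SUPPRESS)
parser.add_argument('-d', '--max-depth', type=int)
parser.add_argument('-p', '--max-pages', type=int)

# Only the option that was actually passed shows up, so unset options
# cannot overwrite values loaded from the YAML config or the environment.
print(vars(parser.parse_args(['-d', '3'])))   # {'max_depth': 3}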
src/soupjobs/scrape.py (new file, 209 lines)

from typing import Tuple, List, Dict, Callable, Any
from urllib.parse import urlparse, urljoin
import asyncio
import random
import re
import json

from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
import yaml

from soupjobs.settings import settings, OutputFormat, TagAttrs


BS_PARSER = 'html.parser'


class Job:
    def __init__(self, session: ClientSession = None):
        self.loop = None
        self.session = session
        self.lock = None
        self.results: Dict[str, List[Any]] = {}
        self.num_targets_found: int = 0
        self.num_links_followed: int = 0
        self.page_counter: int = 0
        self.entry_urls: List[str] = []
        urls = [settings.entry_urls] if isinstance(settings.entry_urls, str) else settings.entry_urls
        for url in urls:
            if is_valid_url(url):
                self.entry_urls.append(url)
            else:
                # Treat the string as a path to a text file with one url per line
                with open(url, 'r') as f:
                    self.entry_urls += [line.strip() for line in f if line.strip()]

    async def init_async(self) -> None:
        self.loop = asyncio.get_event_loop()
        self.session = ClientSession(loop=self.loop)
        self.lock = asyncio.Lock()

    async def start(self) -> None:
        await self.init_async()
        print_start()
        try:
            await self.run(*self.entry_urls)
        finally:
            await self.session.close()
            print_end()

    async def run(self, *urls: str, depth: int = 0) -> None:
        async with self.lock:
            # Drop urls that have already been scraped
            urls = list(set(urls).difference(self.results.keys()))
            if settings.max_pages:
                num_requests_left = settings.max_pages - self.num_links_followed
                if num_requests_left <= 0:
                    return
                urls = urls[:num_requests_left]
            self.num_links_followed += len(urls)
        output = await asyncio.gather(*(self.get_and_scrape(url) for url in urls), loop=self.loop)
        assert isinstance(output, list)
        next_links = construct_next_urls(urls, output)
        if depth < settings.max_depth:
            await self.run(*next_links, depth=depth + 1)

    async def get_and_scrape(self, url: str) -> List[str]:
        async with self.lock:
            if settings.target_limit and self.num_targets_found >= settings.target_limit:
                return []
        async with self.session.get(url) as response:
            html = await response.text()
        targets, links = scrape_document(html)
        async with self.lock:
            self.page_counter += 1
            num_targets_left = settings.target_limit - self.num_targets_found if settings.target_limit else None
            targets = targets[:num_targets_left]
            self.results[url] = targets
            self.num_targets_found += len(targets)
            print_page_results(url, targets, self.limit_reached())
        return links

    def limit_reached(self) -> bool:
        if settings.max_pages and self.page_counter >= settings.max_pages:
            return True
        if settings.target_limit and self.num_targets_found >= settings.target_limit:
            return True
        return False


def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
    soup = BeautifulSoup(html, BS_PARSER)
    # Targets:
    targets = []
    for tag in soup.find_all(target_filter, limit=settings.target_limit):
        target = tag.text if settings.target_extract_text else str(tag)
        if settings.target_transform:
            target = settings.target_transform(target)
        targets.append(target)
    # Next links:
    if settings.next_link_random:
        links = soup.find_all(link_filter)
        if settings.next_link_limit and settings.next_link_limit < len(links):
            # Pick a random subset of the matching links
            links = random.sample(links, settings.next_link_limit)
    else:
        links = soup.find_all(link_filter, limit=settings.next_link_limit)
    return targets, [a['href'] for a in links]


def link_filter(tag: Tag) -> bool:
    try:
        if not string_matches(tag['href'], settings.next_link_href, settings.regex_mode):
            return False
    except KeyError:
        return False
    return tag_filter(tag, 'a', text=settings.next_link_text, attrs=settings.next_link_attrs,
                      func=settings.next_link_match_func, regex=settings.regex_mode)


def target_filter(tag: Tag) -> bool:
    return tag_filter(tag, settings.target_tag, text=settings.target_text, attrs=settings.target_attrs,
                      func=settings.target_match_func, regex=settings.regex_mode)


def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = None,
               func: Callable[[Tag], bool] = None, regex: bool = False) -> bool:
    """
    Returns `True` only if the `tag` matches all provided filter criteria.
    Built to be used in the `find_all` method from BeautifulSoup4.

    Args:
        tag:
            The BS4 Tag object to check
        name (optional):
            What kind of tag will be matched (e.g. 'a' would match an HTML anchor tag)
        text (optional):
            The text enclosed by the tag to be matched
        func (optional):
            Function to run on the tag for filtering (should return `True` if the tag matches)
        attrs (optional):
            Any additional attributes the tag should match, e.g. {'class': 'relevant'}
        regex (optional):
            If `True`, all checks are performed by matching the tag's attributes using the provided arguments as
            a regular expression, otherwise they are checked for string equality.
    """
    if not string_matches(tag.name, name, regex):
        return False
    if not string_matches(tag.text, text, regex):
        return False
    for attr_name, attr_value in (attrs or {}).items():
        try:
            if not string_matches(tag[attr_name], attr_value, regex):
                return False
        except KeyError:
            return False
    if func:
        return func(tag)
    return True


def string_matches(search_string: str, expression: str = None, regex: bool = False) -> bool:
    if expression is None:
        return True
    if not regex:
        return search_string == expression
    return re.compile(expression).search(search_string) is not None


def construct_next_urls(urls: List[str], next_links_lists: List[List[str]]) -> List[str]:
    output = set()
    for url, next_links in zip(urls, next_links_lists):
        for link in next_links:
            output.add(urljoin(url, link))
    return list(output)


def is_valid_url(string: str) -> bool:
    parsed = urlparse(string)
    if not all([parsed.scheme, parsed.netloc]):
        return False
    if parsed.scheme not in ('http', 'https'):
        return False
    return True


def print_start() -> None:
    if settings.output_format == OutputFormat.json:
        print('{') if settings.output_with_urls else print('[')


def print_end() -> None:
    if settings.output_format == OutputFormat.json:
        print('}') if settings.output_with_urls else print(']')


def print_page_results(url: str, targets: List[str], last: bool = False) -> None:
    if settings.output_format == OutputFormat.yaml:
        output = yaml.safe_dump({url: targets} if settings.output_with_urls else targets)
        end = ''
    elif settings.output_format == OutputFormat.json:
        output = json.dumps({url: targets} if settings.output_with_urls else targets, indent=2)[2:-2]
        end = '\n' if last else ',\n'
    else:
        output = '\n'.join(targets)
        if settings.output_with_urls:
            output = url + ':\n' + output
        end = '\n'
    print(output, end=end)
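A quick way to see how `tag_filter` behaves is to run it against a toy document. This snippet assumes the package is importable and passes all filter arguments explicitly, so no config file is needed:

from bs4 import BeautifulSoup

from soupjobs.scrape import tag_filter

soup = BeautifulSoup('<h1 id="firstHeading">Python</h1><p>Intro text</p>', 'html.parser')

# Equality mode: the tag name and every given attribute must match exactly.
matches = soup.find_all(lambda tag: tag_filter(tag, name='h1', attrs={'id': 'firstHeading'}))
print([tag.text for tag in matches])  # ['Python']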
src/soupjobs/settings.py (new file, 74 lines)

from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os

from pydantic import BaseSettings, PyObject
import yaml

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER


TagAttrs = Dict[str, str]


class OutputFormat(str, Enum):
    simple = 'simple'
    yaml = 'yaml'
    json = 'json'


class Settings(BaseSettings):
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: Optional[int]
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: Optional[str]
    target_text: Optional[str]
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject]
    target_required: bool = False
    target_limit: Optional[int]
    target_extract_text: bool = True
    target_transform: Optional[PyObject]

    # Link to next page:
    next_link_text: Optional[str]
    next_link_href: Optional[str]
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject]
    next_link_limit: Optional[int]
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    @staticmethod
    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
        try:
            with open(Settings._CONFIG_FILE, 'r') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            return {}
        # Flatten the nested 'target' and 'next_link' sections into prefixed top-level keys
        for section_name in ('target', 'next_link'):
            section = config.pop(section_name, {})
            for key, value in section.items():
                config.setdefault(f'{section_name}_{key}', value)
        return config

    class Config:
        validate_assignment = True
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings


settings = Settings()
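The `yaml_settings` source is the glue between example.config.yaml and the flat field names on the model: the nested `target` and `next_link` sections are flattened into prefixed keys. Roughly, using values from the example config above:

config = {
    'regex_mode': True,
    'target': {'tag': 'h1', 'attrs': {'id': 'firstHeading'}, 'limit': 20},
    'next_link': {'href': r'^/wiki/\w+', 'limit': 10},
}

# Same flattening loop as in Settings.yaml_settings()
for section_name in ('target', 'next_link'):
    section = config.pop(section_name, {})
    for key, value in section.items():
        config.setdefault(f'{section_name}_{key}', value)

print(config)
# {'regex_mode': True, 'target_tag': 'h1', 'target_attrs': {'id': 'firstHeading'},
#  'target_limit': 20, 'next_link_href': '^/wiki/\\w+', 'next_link_limit': 10}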