first working draft

Daniil Fajnberg 2021-03-13 16:53:15 +01:00
parent f58ed9fee0
commit 65f8c7d4be
8 changed files with 487 additions and 1 deletion

.gitignore

@@ -1,3 +1,4 @@
config.yaml
# Python virtual environment:
/.venv/
# pipenv lock-file and cache directory:

Pipfile

@@ -6,6 +6,8 @@ name = "pypi"
[packages]
beautifulsoup4 = "*"
aiohttp = "*"
pydantic = "*"
pyyaml = "*"

[dev-packages]

setup.py

@@ -14,9 +14,12 @@ setuptools.setup(
    # url="https://github.com/...",
    package_dir={'': 'src'},
    packages=setuptools.find_packages(where='src'),
    package_data={'soupjobs': ['example.config.yaml']},
    install_requires=[
        'aiohttp',
        'beautifulsoup4',
        'pydantic',
        'pyyaml',
    ],
    extras_require={
        'tests': ['coverage'],

src/soupjobs/__init__.py Normal file

@@ -0,0 +1,2 @@
CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
CONFIG_FILE_PLACEHOLDER = 'placeholder'

src/soupjobs/example.config.yaml Normal file

@@ -0,0 +1,124 @@
############################
# Scrape job configuration #
############################

# All available config parameters are explained here, each shown with either its default or an example value.

# General section
#################

# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string that has valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped.
entry_urls:
  - https://en.wikipedia.org/wiki/Python_(programming_language)
  - https://en.wikipedia.org/wiki/Guido_van_Rossum

# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
# regular expressions to match the attribute values; otherwise they are simply checked for string equality.
# Default:
#regex_mode: False
regex_mode: True

# The maximum recursion depth for following matching links to other pages, starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
# Default:
#max_depth: 0
max_depth: 2

# Maximum number of pages to visit.
# Example:
#max_pages: 100

# If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
# Default:
#output_with_urls: False

# Output can be produced in either 'yaml', 'json' or 'simple' format.
# The latter will simply print the extracted targets line by line if `output_with_urls` is `False`, and
# add the url of the page they were extracted from in an additional line before them if that setting is `True`.
# Default:
#output_format: simple


# Target section
################

# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:

  # Filter by HTML tag
  # Example to only look for <h1> tags:
  tag: h1

  # Filter by text inside the tag
  # Example:
  #text: program

  # Filter by any valid HTML attributes
  attrs:
    # Examples:
    id: firstHeading
    #class: foo
    #role: bar

  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
  # Example:
  #match_func: module.function

  # If this is set to `True` and no matching tags are found on a page, an exception is raised.
  # Default:
  #required: False

  # Stop making requests as soon as possible once this number of matches has been extracted.
  # Note that setting this parameter will restrict the number of returned targets to no more than is set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
  limit: 20

  # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
  # Default:
  #extract_text: True

  # Optional transformation function to apply to every matching target tag.
  # Should take a BS4 Tag object as its sole argument and return a string.
  # Example:
  #transform: module.function


# Links section
###############

# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
next_link:

  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page

  # Filter by the `href` attribute of the anchor tag.
  # Example:
  href: '^/wiki/\w+'

  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener

  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function

  # Get at most this many links to other pages from one page.
  # Example:
  limit: 10

  # If `True`, and a limit is set that is below the number of matches on one page,
  # the links are chosen at random. Otherwise the first `limit` links are chosen.
  # Default:
  #random: True
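
The `match_func` option above takes a dotted path to a Python callable that receives a BS4 Tag and returns a boolean. As a rough sketch of what such a user-supplied module might look like (the module and function names here are hypothetical, not part of this commit):

# my_filters.py -- hypothetical module, referenced from the config as e.g.
#   match_func: my_filters.has_citation
from bs4.element import Tag


def has_citation(tag: Tag) -> bool:
    # Match only tags that contain at least one <sup> element (e.g. a footnote marker).
    return tag.find('sup') is not None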

src/soupjobs/run.py Normal file

@@ -0,0 +1,71 @@
from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
from distutils.util import strtobool
import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR


if __name__ == '__main__':
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path(__file__).parent, 'config.yaml'),
        help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a url, a list of urls, or a path to a text file with urls"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the url of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Set to either 'yaml', 'json' or 'simple' format"
    )
    kwargs = vars(parser.parse_args())
    # The config file path must be placed in the environment *before* the settings module is imported,
    # so that the `Settings` object is initialized from the correct file.
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
    for key, value in kwargs.items():
        print(key, value, type(value))
        setattr(settings, key, value)
    if not settings.entry_urls:
        print("No urls specified")
        exit()
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n)"
        proceed = strtobool(input(warning))
        if not proceed:
            print("Cancelled")
            exit()
    asyncio.run(Job().start())
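
Since the run script only needs to place the config file path in the environment before the settings module is imported, a job can presumably also be started without the CLI. A minimal sketch (the config path is a placeholder):

import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR

# Must happen before importing soupjobs.settings / soupjobs.scrape:
os.environ[CONFIG_FILE_ENV_VAR] = '/path/to/config.yaml'

from soupjobs.scrape import Job

asyncio.run(Job().start())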

src/soupjobs/scrape.py Normal file

@@ -0,0 +1,209 @@
from typing import Tuple, List, Dict, Callable, Any
from urllib.parse import urlparse, urljoin
import asyncio
import random
import re
import json

from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
import yaml

from soupjobs.settings import settings, OutputFormat, TagAttrs


BS_PARSER = 'html.parser'


class Job:
    def __init__(self, session: ClientSession = None):
        self.loop = None
        self.session = session
        self.lock = None
        self.results: Dict[str, List[Any]] = {}
        self.num_targets_found: int = 0
        self.num_links_followed: int = 0
        self.page_counter: int = 0
        self.entry_urls: List[str] = []
        urls = [settings.entry_urls] if isinstance(settings.entry_urls, str) else settings.entry_urls
        for url in urls:
            if is_valid_url(url):
                self.entry_urls.append(url)
            else:
                # Not a url, so treat it as a path to a text file with one url per line.
                with open(url, 'r') as f:
                    self.entry_urls += f.readlines()

    async def init_async(self) -> None:
        self.loop = asyncio.get_event_loop()
        self.session = ClientSession(loop=self.loop)
        self.lock = asyncio.Lock()

    async def start(self) -> None:
        await self.init_async()
        print_start()
        try:
            await self.run(*self.entry_urls)
        finally:
            await self.session.close()
            print_end()

    async def run(self, *urls: str, depth: int = 0) -> None:
        async with self.lock:
            # Skip urls that were already scraped and honor the page limit.
            urls = list(set(urls).difference(self.results.keys()))
            if settings.max_pages:
                num_requests_left = settings.max_pages - self.num_links_followed
                if num_requests_left <= 0:
                    return
                urls = urls[:num_requests_left]
            self.num_links_followed += len(urls)
        output = await asyncio.gather(*(self.get_and_scrape(url) for url in urls), loop=self.loop)
        assert isinstance(output, list)
        next_links = construct_next_urls(urls, output)
        if depth < settings.max_depth:
            await self.run(*next_links, depth=depth + 1)

    async def get_and_scrape(self, url: str) -> List[str]:
        async with self.lock:
            if settings.target_limit and self.num_targets_found >= settings.target_limit:
                return []
        async with self.session.get(url) as response:
            html = await response.text()
        targets, links = scrape_document(html)
        async with self.lock:
            self.page_counter += 1
            num_targets_left = settings.target_limit - self.num_targets_found if settings.target_limit else None
            targets = targets[:num_targets_left]
            self.results[url] = targets
            self.num_targets_found += len(targets)
            print_page_results(url, targets, self.limit_reached())
        return links

    def limit_reached(self) -> bool:
        if settings.max_pages and self.page_counter >= settings.max_pages:
            return True
        if settings.target_limit and self.num_targets_found >= settings.target_limit:
            return True
        return False


def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
    soup = BeautifulSoup(html, BS_PARSER)
    # Targets:
    targets = []
    for tag in soup.find_all(target_filter, limit=settings.target_limit):
        target = tag.text if settings.target_extract_text else str(tag)
        if settings.target_transform:
            target = settings.target_transform(target)
        targets.append(target)
    # Next links:
    if settings.next_link_random:
        links = soup.find_all(link_filter)
        if settings.next_link_limit and settings.next_link_limit < len(links):
            indices = list(range(len(links)))
            random.shuffle(indices)
            links = [links[i] for i in indices[:settings.next_link_limit]]
    else:
        links = soup.find_all(link_filter, limit=settings.next_link_limit)
    return targets, [a['href'] for a in links]


def link_filter(tag: Tag) -> bool:
    try:
        if not string_matches(tag['href'], settings.next_link_href, settings.regex_mode):
            return False
    except KeyError:
        return False
    return tag_filter(tag, 'a', text=settings.next_link_text, attrs=settings.next_link_attrs,
                      func=settings.next_link_match_func, regex=settings.regex_mode)


def target_filter(tag: Tag) -> bool:
    return tag_filter(tag, settings.target_tag, text=settings.target_text, attrs=settings.target_attrs,
                      func=settings.target_match_func, regex=settings.regex_mode)


def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = None,
               func: Callable[[Tag], bool] = None, regex: bool = False) -> bool:
    """
    Returns `True` only if the `tag` matches all provided filter criteria.

    Built to be used in the `find_all` method from BeautifulSoup4.

    Args:
        tag:
            The BS4 Tag object to check
        name (optional):
            What kind of tag will be matched (e.g. 'a' would match an HTML anchor tag)
        text (optional):
            The text enclosed by the tag to be matched
        attrs (optional):
            Any additional attributes the tag should match, e.g. {'class': 'relevant'}
        func (optional):
            Function to run on the tag for filtering (should return `True` if the tag matches)
        regex (optional):
            If `True`, all checks are performed by matching the tag's attributes using the provided arguments as
            regular expressions; otherwise they are checked for string equality.
    """
    if not string_matches(tag.name, name, regex):
        return False
    if not string_matches(tag.text, text, regex):
        return False
    for attr_name, attr_value in (attrs or {}).items():
        try:
            if not string_matches(tag[attr_name], attr_value, regex):
                return False
        except KeyError:
            return False
    if func:
        return func(tag)
    return True


def string_matches(search_string: str, expression: str = None, regex: bool = False) -> bool:
    if expression is None:
        return True
    if not regex:
        return search_string == expression
    return re.compile(expression).search(search_string) is not None


def construct_next_urls(urls: List[str], next_links_lists: List[List[str]]) -> List[str]:
    output = set()
    for url, next_links in zip(urls, next_links_lists):
        for link in next_links:
            output.add(urljoin(url, link))
    return list(output)


def is_valid_url(string: str) -> bool:
    parsed = urlparse(string)
    if not all([parsed.scheme, parsed.netloc]):
        return False
    if parsed.scheme not in ('http', 'https'):
        return False
    return True


def print_start() -> None:
    if settings.output_format == OutputFormat.json:
        print('{' if settings.output_with_urls else '[')


def print_end() -> None:
    if settings.output_format == OutputFormat.json:
        print('}' if settings.output_with_urls else ']')


def print_page_results(url: str, targets: List[str], last: bool = False) -> None:
    if settings.output_format == OutputFormat.yaml:
        output = yaml.safe_dump({url: targets} if settings.output_with_urls else targets)
        end = ''
    elif settings.output_format == OutputFormat.json:
        output = json.dumps({url: targets} if settings.output_with_urls else targets, indent=2)[2:-2]
        end = '\n' if last else ',\n'
    else:
        output = '\n'.join(targets)
        if settings.output_with_urls:
            output = url + ':\n' + output
        end = '\n'
    print(output, end=end)
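
For reference, a small illustration of how the filter helpers above behave on static markup (a standalone snippet, not part of the package; it assumes the package is importable):

from bs4 import BeautifulSoup
from soupjobs.scrape import string_matches, tag_filter

html = '<h1 id="firstHeading">Python</h1><a href="/wiki/Guido_van_Rossum">Guido</a>'
soup = BeautifulSoup(html, 'html.parser')

# In regex mode the expression is treated as a regular expression:
assert string_matches('/wiki/Guido_van_Rossum', r'^/wiki/\w+', regex=True)

# tag_filter checks tag name, text and attributes; only the <h1> matches here:
matches = soup.find_all(lambda tag: tag_filter(tag, 'h1', attrs={'id': 'firstHeading'}))
assert len(matches) == 1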

src/soupjobs/settings.py Normal file

@@ -0,0 +1,74 @@
from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os

from pydantic import BaseSettings, PyObject
import yaml

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER


TagAttrs = Dict[str, str]


class OutputFormat(str, Enum):
    simple = 'simple'
    yaml = 'yaml'
    json = 'json'


class Settings(BaseSettings):
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: Optional[int]
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: Optional[str]
    target_text: Optional[str]
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject]
    target_required: bool = False
    target_limit: Optional[int]
    target_extract_text: bool = True
    target_transform: Optional[PyObject]

    # Link to next page:
    next_link_text: Optional[str]
    next_link_href: Optional[str]
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject]
    next_link_limit: Optional[int]
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    @staticmethod
    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
        try:
            with open(Settings._CONFIG_FILE, 'r') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            return {}
        # Flatten the 'target' and 'next_link' sections into prefixed top-level keys,
        # e.g. target.tag -> target_tag and next_link.href -> next_link_href.
        for section_name in ('target', 'next_link'):
            section = config.pop(section_name, {})
            for key, value in section.items():
                config.setdefault(f'{section_name}_{key}', value)
        return config

    class Config:
        validate_assignment = True
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings


settings = Settings()
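
For clarity, the section flattening performed by `yaml_settings` can be illustrated in isolation (the values are taken from the example config above):

# What yaml_settings does to a loaded config dict, in isolation:
config = {
    'regex_mode': True,
    'target': {'tag': 'h1', 'attrs': {'id': 'firstHeading'}},
    'next_link': {'href': r'^/wiki/\w+', 'limit': 10},
}
for section_name in ('target', 'next_link'):
    section = config.pop(section_name, {})
    for key, value in section.items():
        config.setdefault(f'{section_name}_{key}', value)

# config is now flat and matches the Settings field names:
# {'regex_mode': True, 'target_tag': 'h1', 'target_attrs': {'id': 'firstHeading'},
#  'next_link_href': '^/wiki/\\w+', 'next_link_limit': 10}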