first working draft
parent f58ed9fee0
commit 65f8c7d4be
.gitignore (vendored, 1 change)
@@ -1,3 +1,4 @@
 config.yaml
 # Python virtual environment:
 /.venv/
+# pipenv lock-file and cache directory:
Pipfile (2 changes)
@@ -6,6 +6,8 @@ name = "pypi"
 [packages]
 beautifulsoup4 = "*"
 aiohttp = "*"
+pydantic = "*"
+pyyaml = "*"

 [dev-packages]

setup.py (5 changes)
@@ -14,9 +14,12 @@ setuptools.setup(
     # url="https://github.com/...",
     package_dir={'': 'src'},
     packages=setuptools.find_packages(where='src'),
+    package_data={'soupjobs': ['example.config.yaml']},
     install_requires=[
         'aiohttp',
-        'beautifulsoup4'
+        'beautifulsoup4',
+        'pydantic',
+        'pyyaml',
     ],
     extras_require={
         'tests': ['coverage'],
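Since `example.config.yaml` is now declared as package data, it ships with the installed package. One hedged way to read it at runtime with the standard library (this is not something the commit does itself):

from importlib import resources

# Read the bundled example config from the installed 'soupjobs' package:
example_text = resources.read_text('soupjobs', 'example.config.yaml')
print(example_text.splitlines()[0])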
src/soupjobs/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG'
CONFIG_FILE_PLACEHOLDER = 'placeholder'
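These constants are how the config file location is handed to the settings module. A minimal sketch of the intended flow, mirroring what run.py below does (the path is a placeholder):

import os

from soupjobs import CONFIG_FILE_ENV_VAR

# The config path must be in the environment *before* soupjobs.settings is imported,
# because the Settings class reads it at import time ('my_config.yaml' is hypothetical):
os.environ[CONFIG_FILE_ENV_VAR] = 'my_config.yaml'

from soupjobs.settings import settings
print(settings.output_format)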
src/soupjobs/example.config.yaml (new file, 124 lines)
@@ -0,0 +1,124 @@
############################
# Scrape job configuration #
############################

# All possible config parameters are explained here, with either the default or an example value provided for each.


# General section
#################

# If specified as a list, the elements are interpreted as urls to visit and scrape;
# if specified as a string in valid url format, the corresponding page is visited and scraped;
# otherwise the string is assumed to be a path to a text file with one url per line to visit and scrape.
entry_urls:
  - https://en.wikipedia.org/wiki/Python_(programming_language)
  - https://en.wikipedia.org/wiki/Guido_van_Rossum

# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as
# regular expressions to match the attribute values; otherwise they are simply checked for string equality.
# Default:
#regex_mode: False
regex_mode: True

# The maximum recursion depth for following matching links to other pages, starting from the `entry_urls`.
# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed,
# and then for every one of those, the matching links may be followed again, but then the job would end.
# Default:
#max_depth: 0
max_depth: 2

# Maximum number of pages to visit.
# Example:
#max_pages: 100

# If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values;
# otherwise the output is merely a list of all extracted matches.
# Default:
#output_with_urls: False

# Output can be produced in 'yaml', 'json' or 'simple' format.
# The latter will simply print the extracted targets line by line if `output_with_urls` is `False` and
# add the url of the page they were extracted from in an additional line before them if that setting is `True`.
# Default:
#output_format: simple


# Target section
################

# The following section is used to specify filter criteria for the target HTML tags to extract from a page.
# Only HTML tags matching *all* of the specified filters will be extracted.
target:
  # Filter by HTML tag
  # Example to only look for <h1> tags:
  tag: h1

  # Filter by text inside the tag
  # Example:
  #text: program

  # Filter by any valid HTML attributes
  attrs:
    # Examples:
    id: firstHeading
    #class: foo
    #role: bar

  # Filter using a custom python function with the path specified in dot-notation.
  # The function should take a BS4 Tag object as its sole argument and return `True` if the tag matches.
  # Example:
  #match_func: module.function

  # If this is set to `True` and no matching tags are found on a page, an exception is raised.
  # Default:
  #required: False

  # Stop making requests as soon as possible once this number of matches has been extracted.
  # Note that setting this parameter will restrict the number of returned targets to no more than is set here,
  # but in asynchronous execution the total number of requests made and targets scraped may be higher.
  # Example:
  limit: 20

  # If `True`, each matching target tag's text content is extracted; otherwise the entire tag is extracted.
  # Default:
  #extract_text: True

  # Optional transformation function to apply to every matching target tag.
  # Should take a BS4 Tag object as its sole argument and return a string.
  # Example:
  #transform: module.function


# Links section
###############

# This section is used to specify filter criteria for links (<a> tags) to pick for recursive scraping.
# Only HTML tags matching *all* of the specified filters will be considered.
# The linked pages will be recursively scraped at most to the depth specified by the `max_depth` parameter.
next_link:
  # Filter by text inside the anchor tag
  # Example:
  #text: Another important page

  # Filter by the `href` attribute of the anchor tag.
  # Example:
  href: '^/wiki/\w+'

  # Filter by any other valid HTML attributes
  # Example:
  #attrs:
  #  class: result
  #  rel: noopener

  # Function filter; same as in the `target` section.
  # Example:
  #match_func: module.function

  # Get at most this many links to other pages from one page.
  # Example:
  limit: 10

  # If `True`, and a limit is set that is below the number of matches on one page,
  # the links are chosen at random. Otherwise the first `limit` number are chosen.
  # Default:
  #random: True
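The `match_func` and `transform` options take a dotted path to an importable callable. A minimal sketch of such a module, with hypothetical names that are not part of this commit:

# myfilters.py  (hypothetical; referenced in the config as e.g. `match_func: myfilters.is_heading`)
from bs4.element import Tag


def is_heading(tag: Tag) -> bool:
    """Match filter: return True if this tag should be extracted."""
    return tag.name in ('h1', 'h2') and bool(tag.text.strip())


def collapse_whitespace(tag) -> str:
    """Transform: return the matched target as a single-line string."""
    return ' '.join(str(tag).split())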
src/soupjobs/run.py (new file, 71 lines)
@@ -0,0 +1,71 @@
from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
from distutils.util import strtobool
import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR


if __name__ == '__main__':
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path(__file__).parent, 'config.yaml'),
        help="Specify a different config file. Defaults to 'config.yaml' in the same directory as this run script."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a url, a list of urls, or a path to a text file with urls"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the url of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Set to either 'yaml', 'json' or 'simple' format"
    )
    kwargs = vars(parser.parse_args())
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
    # These imports must come after the environment variable is set,
    # because the settings module reads the config file on import:
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
    for key, value in kwargs.items():
        print(key, value, type(value))
        setattr(settings, key, value)
    if not settings.entry_urls:
        print("No urls specified")
        exit()
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n)"
        proceed = strtobool(input(warning))
        if not proceed:
            print("Cancelled")
            exit()
    asyncio.run(Job().start())
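The same job can also be driven without the CLI. A minimal hedged sketch, assuming a config.yaml exists in the working directory (the path, url and tag below are illustrative):

import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR

os.environ[CONFIG_FILE_ENV_VAR] = 'config.yaml'  # hypothetical path
from soupjobs.settings import settings
from soupjobs.scrape import Job

settings.entry_urls = ['https://en.wikipedia.org/wiki/Python_(programming_language)']
settings.target_tag = 'h1'
asyncio.run(Job().start())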
src/soupjobs/scrape.py (new file, 209 lines)
@@ -0,0 +1,209 @@
from typing import Tuple, List, Dict, Callable, Any
from urllib.parse import urlparse, urljoin
import asyncio
import random
import re
import json

from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
import yaml

from soupjobs.settings import settings, OutputFormat, TagAttrs


BS_PARSER = 'html.parser'


class Job:
    def __init__(self, session: ClientSession = None):
        self.loop = None
        self.session = session
        self.lock = None
        self.results: Dict[str, List[Any]] = {}
        self.num_targets_found: int = 0
        self.num_links_followed: int = 0
        self.page_counter: int = 0
        self.entry_urls: List[str] = []
        urls = [settings.entry_urls] if isinstance(settings.entry_urls, str) else settings.entry_urls
        for url in urls:
            if is_valid_url(url):
                self.entry_urls.append(url)
            else:
                # Not a url, so treat it as a path to a text file with one url per line:
                with open(url, 'r') as f:
                    self.entry_urls += [line.strip() for line in f if line.strip()]

    async def init_async(self) -> None:
        self.loop = asyncio.get_event_loop()
        self.session = ClientSession(loop=self.loop)
        self.lock = asyncio.Lock()

    async def start(self) -> None:
        await self.init_async()
        print_start()
        try:
            await self.run(*self.entry_urls)
        finally:
            await self.session.close()
            print_end()

    async def run(self, *urls: str, depth: int = 0) -> None:
        async with self.lock:
            # Skip urls that have already been scraped:
            urls = list(set(urls).difference(self.results.keys()))
            if settings.max_pages:
                num_requests_left = settings.max_pages - self.num_links_followed
                if num_requests_left <= 0:
                    return
                urls = urls[:num_requests_left]
            self.num_links_followed += len(urls)
        output = await asyncio.gather(*(self.get_and_scrape(url) for url in urls), loop=self.loop)
        assert isinstance(output, list)
        next_links = construct_next_urls(urls, output)
        if depth < settings.max_depth:
            await self.run(*next_links, depth=depth + 1)

    async def get_and_scrape(self, url: str) -> List[str]:
        async with self.lock:
            if settings.target_limit and self.num_targets_found >= settings.target_limit:
                return []
        async with self.session.get(url) as response:
            html = await response.text()
        targets, links = scrape_document(html)
        async with self.lock:
            self.page_counter += 1
            num_targets_left = settings.target_limit - self.num_targets_found if settings.target_limit else None
            targets = targets[:num_targets_left]
            self.results[url] = targets
            self.num_targets_found += len(targets)
            print_page_results(url, targets, self.limit_reached())
        return links

    def limit_reached(self) -> bool:
        if settings.max_pages and self.page_counter >= settings.max_pages:
            return True
        if settings.target_limit and self.num_targets_found >= settings.target_limit:
            return True
        return False


def scrape_document(html: str) -> Tuple[List[Any], List[str]]:
    soup = BeautifulSoup(html, BS_PARSER)
    # Targets:
    targets = []
    for tag in soup.find_all(target_filter, limit=settings.target_limit):
        target = tag.text if settings.target_extract_text else str(tag)
        if settings.target_transform:
            target = settings.target_transform(target)
        targets.append(target)
    # Next links:
    if settings.next_link_random:
        links = soup.find_all(link_filter)
        if settings.next_link_limit and settings.next_link_limit < len(links):
            indices = list(range(len(links)))
            random.shuffle(indices)
            # Pick `next_link_limit` links at random, using the shuffled indices:
            links = [links[i] for i in indices[:settings.next_link_limit]]
    else:
        links = soup.find_all(link_filter, limit=settings.next_link_limit)
    return targets, [a['href'] for a in links]


def link_filter(tag: Tag) -> bool:
    try:
        if not string_matches(tag['href'], settings.next_link_href, settings.regex_mode):
            return False
    except KeyError:
        # Anchor tags without an href attribute are never followed:
        return False
    return tag_filter(tag, 'a', text=settings.next_link_text, attrs=settings.next_link_attrs,
                      func=settings.next_link_match_func, regex=settings.regex_mode)


def target_filter(tag: Tag) -> bool:
    return tag_filter(tag, settings.target_tag, text=settings.target_text, attrs=settings.target_attrs,
                      func=settings.target_match_func, regex=settings.regex_mode)


def tag_filter(tag: Tag, name: str = None, text: str = None, attrs: TagAttrs = None, func: Callable[[Tag], bool] = None,
               regex: bool = False) -> bool:
    """
    Returns `True` only if the `tag` matches all provided filter criteria.
    Built to be used in the `find_all` method from BeautifulSoup4.

    Args:
        tag:
            The BS4 Tag object to check
        name (optional):
            What kind of tag will be matched (e.g. 'a' would match an HTML anchor tag)
        text (optional):
            The text enclosed by the tag to be matched
        func (optional):
            Function to run on the tag for filtering (should return `True` if the tag matches)
        attrs (optional):
            Any additional attributes the tag should match, e.g. {'class': 'relevant'}
        regex (optional):
            If `True`, all checks are performed by matching the tag's attributes using the provided arguments as
            regular expressions, otherwise they are checked for string equality.
    """
    if not string_matches(tag.name, name, regex):
        return False
    if not string_matches(tag.text, text, regex):
        return False
    for attr_name, attr_value in (attrs or {}).items():
        try:
            if not string_matches(tag[attr_name], attr_value, regex):
                return False
        except KeyError:
            return False
    if func:
        return func(tag)
    return True


def string_matches(search_string: str, expression: str = None, regex: bool = False) -> bool:
    if expression is None:
        return True
    if not regex:
        return search_string == expression
    return re.compile(expression).search(search_string) is not None


def construct_next_urls(urls: List[str], next_links_lists: List[List[str]]) -> List[str]:
    output = set()
    for url, next_links in zip(urls, next_links_lists):
        for link in next_links:
            output.add(urljoin(url, link))
    return list(output)


def is_valid_url(string: str) -> bool:
    parsed = urlparse(string)
    if not all([parsed.scheme, parsed.netloc]):
        return False
    if parsed.scheme not in ('http', 'https'):
        return False
    return True


def print_start() -> None:
    if settings.output_format == OutputFormat.json:
        print('{') if settings.output_with_urls else print('[')


def print_end() -> None:
    if settings.output_format == OutputFormat.json:
        print('}') if settings.output_with_urls else print(']')


def print_page_results(url: str, targets: List[str], last: bool = False) -> None:
    if settings.output_format == OutputFormat.yaml:
        output = yaml.safe_dump({url: targets} if settings.output_with_urls else targets)
        end = ''
    elif settings.output_format == OutputFormat.json:
        output = json.dumps({url: targets} if settings.output_with_urls else targets, indent=2)[2:-2]
        end = '\n' if last else ',\n'
    else:
        output = '\n'.join(targets)
        if settings.output_with_urls:
            output = url + ':\n' + output
        end = '\n'
    print(output, end=end)
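As a standalone illustration (not part of the commit) of how `tag_filter` treats its arguments with and without `regex`:

from bs4 import BeautifulSoup

from soupjobs.scrape import tag_filter

soup = BeautifulSoup('<h1 id="firstHeading">Python (programming language)</h1>', 'html.parser')
tag = soup.h1

# Plain string equality on the tag name and the `id` attribute:
print(tag_filter(tag, 'h1', attrs={'id': 'firstHeading'}))              # True
# With regex=True the same arguments are treated as regular expressions:
print(tag_filter(tag, 'h1', attrs={'id': '^first'}, regex=True))        # True
print(tag_filter(tag, 'h1', text='programming', attrs={}, regex=True))  # True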
src/soupjobs/settings.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from typing import Dict, List, Union, Optional, Any
from pathlib import Path
from enum import Enum
import os

from pydantic import BaseSettings, PyObject
import yaml

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER


TagAttrs = Dict[str, str]


class OutputFormat(str, Enum):
    simple = 'simple'
    yaml = 'yaml'
    json = 'json'


class Settings(BaseSettings):
    _CONFIG_FILE: Optional[Path] = os.getenv(CONFIG_FILE_ENV_VAR, CONFIG_FILE_PLACEHOLDER)

    entry_urls: Union[str, List[str]] = []
    regex_mode: bool = False
    max_depth: int = 0
    max_pages: Optional[int]
    output_with_urls: bool = False
    output_format: OutputFormat = OutputFormat.simple

    # Target to extract:
    target_tag: Optional[str]
    target_text: Optional[str]
    target_attrs: TagAttrs = {}
    target_match_func: Optional[PyObject]
    target_required: bool = False
    target_limit: Optional[int]
    target_extract_text: bool = True
    target_transform: Optional[PyObject]

    # Link to next page:
    next_link_text: Optional[str]
    next_link_href: Optional[str]
    next_link_attrs: TagAttrs = {}
    next_link_match_func: Optional[PyObject]
    next_link_limit: Optional[int]
    next_link_random: bool = True

    def has_target_filters(self) -> bool:
        return any([self.target_tag, self.target_text, self.target_attrs, self.target_match_func])

    @staticmethod
    def yaml_settings(_settings_obj: BaseSettings) -> Dict[str, Any]:
        try:
            with open(Settings._CONFIG_FILE, 'r') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            return {}
        # Flatten the nested 'target' and 'next_link' sections into 'target_*' and 'next_link_*' keys:
        for section_name in ('target', 'next_link'):
            section = config.pop(section_name, {})
            for key, value in section.items():
                config.setdefault(f'{section_name}_{key}', value)
        return config

    class Config:
        validate_assignment = True
        env_file_encoding = 'utf-8'

        @classmethod
        def customise_sources(cls, init_settings, env_settings, file_secret_settings):
            return init_settings, env_settings, file_secret_settings, Settings.yaml_settings


settings = Settings()
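The `yaml_settings` source flattens the nested `target` and `next_link` sections of the config file into the flat `target_*` and `next_link_*` fields declared above. A standalone sketch of that step with illustrative values:

config = {'regex_mode': True, 'target': {'tag': 'h1', 'attrs': {'id': 'firstHeading'}, 'limit': 20}}
for section_name in ('target', 'next_link'):
    section = config.pop(section_name, {})
    for key, value in section.items():
        config.setdefault(f'{section_name}_{key}', value)
print(config)
# {'regex_mode': True, 'target_tag': 'h1', 'target_attrs': {'id': 'firstHeading'}, 'target_limit': 20}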