# soupjobs/src/soupjobs/run.py
"""CLI entry point for starting soupjobs scraping jobs.

Parses command-line options, exports the config-file path via an environment
variable *before* the settings module is imported (it reads the variable at
import time), applies CLI overrides to the settings, and runs the scrape job.
"""
from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
import asyncio
import os
import sys
from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME

# Answers accepted as "yes" at the confirmation prompt. Replaces
# distutils.util.strtobool, which was removed in Python 3.12 (PEP 632).
_TRUTHY = {'y', 'yes', 't', 'true', 'on', '1'}

if __name__ == '__main__':
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
        help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a url, list of urls or path to a text file with urls"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the url of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Set to either 'yaml', 'json' or 'simple' format"
    )
    kwargs = vars(parser.parse_args())
    # Must happen before the soupjobs.settings import below: the settings
    # module reads this environment variable when it is first imported.
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
    # CLI arguments take precedence over env vars and the config file.
    # (Thanks to argument_default=SUPPRESS, only flags the user actually
    # passed appear in kwargs, so unset options are not clobbered.)
    for key, value in kwargs.items():
        setattr(settings, key, value)
    if not settings.entry_urls:
        print("No urls specified")
        sys.exit()
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n)"
        # Anything other than an explicit "yes"-style answer cancels the run;
        # the old strtobool call raised ValueError on unrecognized input.
        if input(warning).strip().lower() not in _TRUTHY:
            print("Cancelled")
            sys.exit()
    asyncio.run(Job().start())