from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
from distutils.util import strtobool
import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME


if __name__ == '__main__':
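    # argument_default=SUPPRESS keeps options that were not explicitly passed
    # out of the parsed namespace, so only values given on the command line
    # end up overriding the loaded settings further down.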
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
        help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a URL, a list of URLs, or the path to a text file containing URLs"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the URL of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Set the output format: either 'yaml', 'json' or 'simple'"
    )
    kwargs = vars(parser.parse_args())
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
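    # The settings module presumably reads the config file on first import,
    # which is why these imports are deferred until after the environment
    # variable pointing to the chosen config file has been set.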
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
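    # Apply the explicitly passed CLI options on top of the loaded settings.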
    for key, value in kwargs.items():
        print(key, value, type(value))
        setattr(settings, key, value)
    if not settings.entry_urls:
        print("No URLs specified")
        exit()
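    # Without any target filters every matching tag on every visited page
    # would be collected, so ask for confirmation before proceeding.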
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n) "
        # strtobool() accepts e.g. 'y'/'n' and 'yes'/'no', but raises a
        # ValueError on input it does not recognize.
        proceed = strtobool(input(warning).lower())
        if not proceed:
            print("Cancelled")
            exit()
    asyncio.run(Job().start())
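
# Example invocation (hypothetical file name and URL, assuming this script is
# the package's CLI entry point saved as e.g. cli.py):
#   python cli.py -d 2 -p 50 -o json https://example.com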