from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
import asyncio
import os
import sys

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME


if __name__ == '__main__':
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of the options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
        help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a single URL, a list of URLs, or the path to a text file containing URLs"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the URL of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Output format: either 'yaml', 'json' or 'simple'"
    )
    kwargs = vars(parser.parse_args())
    # The settings module reads the config file path from this environment
    # variable at import time, so it must be set before the imports below.
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))

    from soupjobs.settings import settings
    from soupjobs.scrape import Job

    # Any option passed on the command line overrides the corresponding
    # setting loaded from the environment/config file. Options that were not
    # passed are SUPPRESSed by argparse and therefore absent from `kwargs`.
    for key, value in kwargs.items():
        setattr(settings, key, value)

    if not settings.entry_urls:
        print("No URLs specified")
        sys.exit()

    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n) "
        # `distutils.util.strtobool` is deprecated and distutils was removed
        # in Python 3.12, so the confirmation is checked directly instead;
        # anything other than an explicit "yes" aborts.
        if input(warning).strip().lower() not in ('y', 'yes'):
            print("Cancelled")
            sys.exit()

    asyncio.run(Job().start())
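
# Usage sketch. How this script is invoked depends on how the package is
# installed; `python -m soupjobs` is an assumption here, not confirmed by the
# source. Entry URLs may also come from the config file, in which case the
# positional argument can be omitted entirely.
#
#   python -m soupjobs https://example.com -d 2 -p 100 -o json
#   python -m soupjobs urls.txt --regex-mode --output-with-urls
#   python -m soupjobs -c /path/to/other-config.yaml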