from argparse import ArgumentParser, SUPPRESS
from pathlib import Path
from distutils.util import strtobool
import asyncio
import os

from soupjobs import CONFIG_FILE_ENV_VAR, CONFIG_FILE_DEFAULT_NAME


if __name__ == '__main__':
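    # argument_default=SUPPRESS keeps options that were not explicitly passed
    # out of the parsed namespace, so only values given on the command line
    # end up overriding the loaded settings further down.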
    parser = ArgumentParser(
        argument_default=SUPPRESS,
        description="CLI tool for starting scraping jobs. "
                    "The provided settings always override environment variables and those in the config file. "
                    "For more detailed descriptions of options refer to the example config file."
    )
    parser.add_argument(
        '-c', '--config-file',
        type=Path,
        default=Path(Path.cwd(), CONFIG_FILE_DEFAULT_NAME),
        help="Specify a different config file path. Defaults to 'config.yaml' in the current working directory."
    )
    parser.add_argument(
        'entry_urls',
        type=str,
        nargs='*',
        help="Can be a URL, a list of URLs, or the path to a text file containing URLs"
    )
    parser.add_argument(
        '-r', '--regex-mode',
        action='store_true',
        help="Set this flag to treat all filter strings as regular expressions"
    )
    parser.add_argument(
        '-d', '--max-depth',
        type=int,
        help="Maximum recursion depth for following matching links to other pages"
    )
    parser.add_argument(
        '-p', '--max-pages',
        type=int,
        help="Maximum number of pages to visit"
    )
    parser.add_argument(
        '--output-with-urls',
        action='store_true',
        help="Set this flag to map scraped results to the URL of the page they were found on"
    )
    parser.add_argument(
        '-o', '--output-format',
        type=str,
        help="Set the output format: either 'yaml', 'json' or 'simple'"
    )
    kwargs = vars(parser.parse_args())
    os.environ[CONFIG_FILE_ENV_VAR] = str(kwargs.pop('config_file'))
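    # The settings module presumably reads the config file on first import,
    # which is why these imports are deferred until after the environment
    # variable pointing to the chosen config file has been set.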
    from soupjobs.settings import settings
    from soupjobs.scrape import Job
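    # Apply the explicitly passed CLI options on top of the loaded settings.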
    for key, value in kwargs.items():
        print(key, value, type(value))
        setattr(settings, key, value)
    if not settings.entry_urls:
        print("No URLs specified")
        exit()
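    # Without any target filters every matching tag on every visited page
    # would be collected, so ask for confirmation before proceeding.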
    if not settings.has_target_filters():
        warning = "Warning: No filters were set for target tags to scrape. This may return a LOT of data. " \
                  "Are you sure you want to proceed? (y/n) "
        # strtobool() accepts e.g. 'y'/'n' and 'yes'/'no', but raises a
        # ValueError on input it does not recognize.
        proceed = strtobool(input(warning).lower())
        if not proceed:
            print("Cancelled")
            exit()
    asyncio.run(Job().start())
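
# Example invocation (hypothetical file name and URL, assuming this script is
# the package's CLI entry point saved as e.g. cli.py):
#   python cli.py -d 2 -p 50 -o json https://example.com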