diff --git a/.gitignore b/.gitignore index fe4ca33..605b04b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +config.yaml # Python virtual environment: /.venv/ # pipenv lock-file and cache directory: diff --git a/Pipfile b/Pipfile index f2797b1..55ad154 100644 --- a/Pipfile +++ b/Pipfile @@ -6,6 +6,8 @@ name = "pypi" [packages] beautifulsoup4 = "*" aiohttp = "*" +pydantic = "*" +pyyaml = "*" [dev-packages] diff --git a/setup.py b/setup.py index 1414157..67fde8e 100644 --- a/setup.py +++ b/setup.py @@ -14,9 +14,12 @@ setuptools.setup( # url="https://github.com/...", package_dir={'': 'src'}, packages=setuptools.find_packages(where='src'), + package_data={'soupjobs': ['example.config.yaml']}, install_requires=[ 'aiohttp', - 'beautifulsoup4' + 'beautifulsoup4', + 'pydantic', + 'pyyaml', ], extras_require={ 'tests': ['coverage'], diff --git a/src/soupjobs/__init__.py b/src/soupjobs/__init__.py new file mode 100644 index 0000000..9c9ee07 --- /dev/null +++ b/src/soupjobs/__init__.py @@ -0,0 +1,2 @@ +CONFIG_FILE_ENV_VAR = 'SOUPJOBS_CONFIG' +CONFIG_FILE_PLACEHOLDER = 'placeholder' diff --git a/src/soupjobs/example.config.yaml b/src/soupjobs/example.config.yaml new file mode 100644 index 0000000..deb3774 --- /dev/null +++ b/src/soupjobs/example.config.yaml @@ -0,0 +1,124 @@ +############################ +# Scrape job configuration # +############################ + +# All possible config parameters are explained here with either the default or an example value provided with them. + +# General section +################# + +# If specified as a list, the elements are interpreted as urls to visit and scrape; +# if specified as a string that has valid url format, the corresponding page is visited and scraped; +# otherwise the string is assumed to be a path to a text file with a url on every line to be visited and scraped. 
+entry_urls: + - https://en.wikipedia.org/wiki/Python_(programming_language) + - https://en.wikipedia.org/wiki/Guido_van_Rossum + +# If `True`, all filter checks are performed by matching an HTML tag's attributes using the provided arguments as +# regular expressions to match the attribute values; otherwise they are simply checked for string equality. +# Default: +#regex_mode: False +regex_mode: True + +# The maximum recursion depth for following matching links to other pages starting from the `entry_urls`. +# For example, a `max_depth` of 2 means that for every entry page the matching links may be followed, +# and then for every one of those, the matching links may be followed again, but then the job would end. +# Default: +#max_depth: 0 +max_depth: 2 + +# Maximum number of pages to visit. +# Example: +#max_pages: 100 + +# If `True`, the output will be a mapping with the visited urls as keys and lists of the extracted matches as values; +# otherwise the output is merely a list of all extracted matches. +# Default: +#output_with_urls: False + +# Output can be produced in either 'yaml', 'json' or 'simple' format. +# The 'simple' format will print the extracted targets line by line if `output_with_urls` is `False` and +# add the url of the page they were extracted from in an additional line before them if that setting is `True`. +# Default: +#output_format: simple + + +# Target section +################ + +# The following section is used to specify filter criteria for the target HTML tags to extract from a page. +# Only HTML tags matching *all* of the specified filters will be extracted. +target: + # Filter by HTML tag + # Example to only look for