Compare commits
5 Commits
19d5d1e3eb
...
master
Author | SHA1 | Date | |
---|---|---|---|
7dbfcf568b | |||
661a1a98da | |||
f2756506af | |||
d9344b1b4b | |||
83281f3625 |
10
.coveragerc
Normal file
10
.coveragerc
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
[run]
|
||||||
|
source = .
|
||||||
|
branch = true
|
||||||
|
omit =
|
||||||
|
.venv/*
|
||||||
|
|
||||||
|
[report]
|
||||||
|
fail_under = 100
|
||||||
|
show_missing = True
|
||||||
|
skip_covered = True
|
6
.gitignore
vendored
6
.gitignore
vendored
@ -3,4 +3,8 @@
|
|||||||
# PyCharm:
|
# PyCharm:
|
||||||
/.idea/
|
/.idea/
|
||||||
# Distribution / packaging:
|
# Distribution / packaging:
|
||||||
*.egg-info/
|
*.egg-info/
|
||||||
|
# Python cache:
|
||||||
|
__pycache__/
|
||||||
|
# Tests:
|
||||||
|
.coverage
|
3
coverage.sh
Executable file
3
coverage.sh
Executable file
@ -0,0 +1,3 @@
|
|||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
coverage erase && coverage run -m unittest discover && coverage report
|
@ -19,7 +19,7 @@ keywords = webscraping, html, markup, dom, scraper, attributes, tags, stocks, fi
|
|||||||
package_dir =
|
package_dir =
|
||||||
= src
|
= src
|
||||||
packages = find:
|
packages = find:
|
||||||
python_requires = >=3.7
|
python_requires = >=3.8
|
||||||
install_requires =
|
install_requires =
|
||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
aiohttp
|
aiohttp
|
||||||
|
@ -8,7 +8,7 @@ from pathlib import Path
|
|||||||
from . import get_all_data, log
|
from . import get_all_data, log
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
async def main() -> None:
|
||||||
parser = ArgumentParser(description="Scrape all stock symbols")
|
parser = ArgumentParser(description="Scrape all stock symbols")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-v', '--verbose',
|
'-v', '--verbose',
|
||||||
@ -29,7 +29,7 @@ def main() -> None:
|
|||||||
if args.verbose:
|
if args.verbose:
|
||||||
log.setLevel(logging.DEBUG)
|
log.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
data = asyncio.run(get_all_data(args.sequential))
|
data = await get_all_data(args.sequential)
|
||||||
|
|
||||||
if args.to_file is None:
|
if args.to_file is None:
|
||||||
csv.writer(sys.stdout).writerows(data)
|
csv.writer(sys.stdout).writerows(data)
|
||||||
@ -39,4 +39,4 @@ def main() -> None:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
asyncio.run(main())
|
||||||
|
@ -1,12 +1,13 @@
|
|||||||
from unittest import TestCase
|
import logging
|
||||||
from unittest.mock import patch, MagicMock, call
|
from unittest import IsolatedAsyncioTestCase
|
||||||
|
from unittest.mock import patch, MagicMock, AsyncMock, call
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from stocksymbolscraper import scrape
|
from stocksymbolscraper import scrape
|
||||||
|
|
||||||
|
|
||||||
class ScrapeTestCase(TestCase):
|
class ScrapeTestCase(IsolatedAsyncioTestCase):
|
||||||
|
|
||||||
@patch.object(scrape, 'get_single_tr_data')
|
@patch.object(scrape, 'get_single_tr_data')
|
||||||
def test_extract_row_data(self, mock_get_single_tr_data: MagicMock):
|
def test_extract_row_data(self, mock_get_single_tr_data: MagicMock):
|
||||||
@ -53,8 +54,111 @@ class ScrapeTestCase(TestCase):
|
|||||||
output = scrape.get_str_from_td(test_td)
|
output = scrape.get_str_from_td(test_td)
|
||||||
self.assertEqual(expected_output, output)
|
self.assertEqual(expected_output, output)
|
||||||
|
|
||||||
|
@patch.object(scrape, 'ClientSession')
async def test_soup_from_url(self, mock_session_cls):
    """
    Check that `soup_from_url` returns the parsed markup of the response text.

    The function is called once without a session argument (the patched
    `ClientSession` class then supplies the fake session) and once with the
    fake session passed explicitly; both calls must yield the same soup.
    """
    markup = '<b>foo</b>'
    wanted_soup = BeautifulSoup(markup, scrape.HTML_PARSER)

    # Fake response whose awaited `text()` yields the test markup.
    fake_response = MagicMock(text=AsyncMock(return_value=markup))

    # Object returned by `session.get(...)`; entering it with `async with`
    # hands out the fake response.
    get_context = MagicMock()
    get_context.__aenter__ = AsyncMock(return_value=fake_response)

    fake_session = MagicMock(get=MagicMock(return_value=get_context))
    mock_session_cls.return_value = fake_session

    # First without, then with an explicitly passed session object.
    for session_args in ((), (fake_session,)):
        result = await scrape.soup_from_url('foo', *session_args)
        self.assertEqual(wanted_soup, result)
|
||||||
|
|
||||||
def test_trs_from_page(self):
    """
    Check `trs_from_page` with well-formed and with unexpected markup.

    Well-formed markup: the returned sequence must equal all `<tr>` tags of
    the soup. Unexpected markup: `UnexpectedMarkupError` must be raised and
    the prettified soup must be written through a single `open(...)` call.
    """
    # Well-formed table with two rows.
    tr1_text, tr2_text = '<tr>foo</tr>', '<tr>bar</tr>'
    test_html = f'<div id="marketsindex"><table><tbody>{tr1_text}{tr2_text}</tbody></table></div>'
    test_soup = BeautifulSoup(test_html, scrape.HTML_PARSER)
    output = scrape.trs_from_page(test_soup)
    expected_output = test_soup.find_all('tr')
    self.assertSequenceEqual(expected_output, output)

    # Table without the expected row structure.
    test_html = '<div id="marketsindex"><table>garbage</table></div>'
    test_soup = BeautifulSoup(test_html, scrape.HTML_PARSER)

    # Logging is switched off for this part (presumably the tested function
    # logs the markup problem -- confirm against `scrape`). The `finally`
    # guarantees it is switched back on even when an assertion below fails,
    # so a failure here cannot silence logging for the remaining tests.
    logging.disable(logging.CRITICAL)
    try:
        # `open` is patched in the `scrape` namespace so nothing is written to disk.
        with patch.object(scrape, 'open') as mock_open:
            self.assertRaises(scrape.UnexpectedMarkupError, scrape.trs_from_page, test_soup)
            mock_open.assert_called_once()
            mock_open.return_value.__enter__.return_value.write.assert_called_once_with(test_soup.prettify())
    finally:
        logging.disable(logging.NOTSET)
|
||||||
|
|
||||||
|
@patch.object(scrape, 'soup_from_url')
@patch.object(scrape, 'trs_from_page')
@patch.object(scrape, 'extract_row_data')
@patch.object(scrape, 'ClientSession')
async def test_get_data_from_category(self, mock_session_cls, mock_extract_row_data,
                                      mock_trs_from_page, mock_soup_from_url):
    """
    Exercise `get_data_from_category` in three scenarios.

    1. No table rows are found and no session is passed.
    2. Table rows are found, no session is passed, `last_page=1`.
    3. Table rows are found, a session object is passed, `last_page=1`.

    `ClientSession` is patched for the whole test, so every session object
    involved here is a mock.
    """
    # With no session argument, the session is expected to be whatever the
    # patched `ClientSession` class returns when called.
    fake_session = MagicMock()
    mock_session_cls.return_value = fake_session
    fake_soup = MagicMock()
    mock_soup_from_url.return_value = fake_soup
    category = 'ßßß'
    url = f'{scrape.BASE_URL}{category}'

    # Scenario 1: an empty list of rows, so no row data is extracted.
    mock_trs_from_page.return_value = []
    wanted = []
    mock_extract_row_data.return_value = wanted
    result = await scrape.get_data_from_category(category)
    self.assertListEqual(wanted, result)
    mock_soup_from_url.assert_called_once_with(url, fake_session)
    mock_trs_from_page.assert_called_once_with(fake_soup)
    mock_extract_row_data.assert_not_called()

    for used_mock in (mock_soup_from_url, mock_trs_from_page):
        used_mock.reset_mock()

    # Scenarios 2 and 3: (fake) rows are returned; `last_page=1` is passed
    # to stop after the first page's rows have been processed.
    fake_rows = ['foo', 'bar']
    mock_trs_from_page.return_value = fake_rows
    wanted = ['a', 'b']
    mock_extract_row_data.return_value = wanted

    def verify(test_output, session_obj):
        """Shared checks; the two scenarios differ only in the session object used."""
        self.assertListEqual(wanted, test_output)
        soup_calls = [call(url, session_obj), call(f'{url}/{2}', session_obj)]
        mock_soup_from_url.assert_has_calls(soup_calls)
        trs_calls = [call(fake_soup), call(fake_soup)]
        mock_trs_from_page.assert_has_calls(trs_calls)
        mock_extract_row_data.assert_called_once_with(*fake_rows)

    # Scenario 2: no session passed.
    result = await scrape.get_data_from_category(category, last_page=1)
    verify(result, fake_session)

    for used_mock in (mock_soup_from_url, mock_trs_from_page, mock_extract_row_data):
        used_mock.reset_mock()

    # Scenario 3: a session object is passed explicitly. It comes from
    # entering `scrape.ClientSession()`, which is the patched class here.
    async with scrape.ClientSession() as session:
        result = await scrape.get_data_from_category(category, session, last_page=1)
    verify(result, session)
|
||||||
|
|
||||||
|
@patch.object(scrape, 'get_data_from_category')
@patch.object(scrape, 'ClientSession')
async def test_get_all_data(self, mock_session_cls, mock_get_data_from_category):
    """
    Check `get_all_data` in sequential and in non-sequential mode.

    In both modes the output must be the per-category results joined into
    one list, and `get_data_from_category` must have been called with each
    entry of `scrape.CATEGORIES` together with the session object.
    """
    # Session handed out when the patched `ClientSession()` is entered
    # with `async with`.
    mock_session = MagicMock()
    mock_session_cls.return_value.__aenter__.return_value = mock_session
    mock_result = ['foo']
    expected_output = len(scrape.CATEGORIES) * mock_result
    mock_get_data_from_category.return_value = mock_result
    expected_calls = [call(category, mock_session) for category in scrape.CATEGORIES]

    output = await scrape.get_all_data(sequential=True)
    self.assertListEqual(expected_output, output)
    mock_get_data_from_category.assert_has_calls(expected_calls)

    # Forget the calls recorded so far. Without this, the call assertion
    # below would already be satisfied by the sequential run and would not
    # verify the non-sequential run at all. The configured return value is
    # kept by `reset_mock()`.
    mock_get_data_from_category.reset_mock()

    output = await scrape.get_all_data(sequential=False)
    self.assertListEqual(expected_output, output)
    # Call order is deliberately not asserted for the non-sequential mode.
    mock_get_data_from_category.assert_has_calls(expected_calls, any_order=True)
|
||||||
|
Reference in New Issue
Block a user