fixes; another function implemented

This commit is contained in:
Daniil Fajnberg 2021-11-27 15:11:42 +01:00
parent 25be101cf9
commit f2abd8f2ce
2 changed files with 28 additions and 17 deletions

View File

@ -1,5 +1,5 @@
import logging
from typing import Union from typing import Union
import re
from aiohttp.client import ClientSession from aiohttp.client import ClientSession
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -8,6 +8,8 @@ from bs4.element import ResultSet, Tag
from . import constants from . import constants
log = logging.getLogger(__name__)
# The resulting dictionary's keys correspond to the name of the item (row) in the financial statement, # The resulting dictionary's keys correspond to the name of the item (row) in the financial statement,
# while its values will always be tuples with a length corresponding to the number of periods (columns) # while its values will always be tuples with a length corresponding to the number of periods (columns)
# and elements being the actual numbers, with the exception of the first key-value-pair, which will represent # and elements being the actual numbers, with the exception of the first key-value-pair, which will represent
@ -37,21 +39,21 @@ def is_relevant_table_row(tr: Tag) -> bool:
""" """
Returns True if the item in the table row is marked as relevant. Additionally warns when an item is unknown. Returns True if the item in the table row is marked as relevant. Additionally warns when an item is unknown.
""" """
item_name = str(tr.find_next('td').find_next('div').string).strip() if tr.name != 'tr':
if constants.FINANCIAL_STATEMENT_ITEMS[item_name]: return False
return True item_name = str(tr.td.div.string).strip()
try:
return constants.FINANCIAL_STATEMENT_ITEMS[item_name]
except KeyError:
log.warning(f"Unknown item name '{item_name}' found in financial statement.")
return False
def find_relevant_table_rows(soup: BeautifulSoup) -> ResultSet: def find_relevant_table_rows(soup: BeautifulSoup) -> ResultSet:
""" """
Returns the table rows containing the data of interest. Returns the table rows containing the data of interest.
""" """
table_div = soup.find('div', attrs={'class': 'financials'}) return soup.find('div', attrs={'class': 'financials'}).tbody.find_all(is_relevant_table_row)
rows = table_div.find_all('tr')[1:-1]
for r in rows:
if not is_relevant_table_row(r):
rows.remove(r)
return rows
def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]: def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]:

View File

@ -1,3 +1,4 @@
import logging
from pathlib import Path from pathlib import Path
from unittest import IsolatedAsyncioTestCase from unittest import IsolatedAsyncioTestCase
from unittest.mock import patch, MagicMock, AsyncMock, call from unittest.mock import patch, MagicMock, AsyncMock, call
@ -17,6 +18,8 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
# view page source @ line 2055 # view page source @ line 2055
TEST_HTML_FILE_PATH = Path(THIS_DIR, 'test_structure.html') TEST_HTML_FILE_PATH = Path(THIS_DIR, 'test_structure.html')
log_lvl: int
@staticmethod @staticmethod
def get_mock_session(response_text: str = None) -> MagicMock: def get_mock_session(response_text: str = None) -> MagicMock:
mock_response = MagicMock() mock_response = MagicMock()
@ -32,6 +35,12 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
with open(cls.TEST_HTML_FILE_PATH, 'r') as f: with open(cls.TEST_HTML_FILE_PATH, 'r') as f:
test_html = f.read() test_html = f.read()
cls.test_soup = BeautifulSoup(test_html, HTML_PARSER) cls.test_soup = BeautifulSoup(test_html, HTML_PARSER)
cls.log_lvl = functions.log.level
functions.log.setLevel(logging.CRITICAL)
@classmethod
def tearDownClass(cls) -> None:
functions.log.setLevel(cls.log_lvl)
@patch.object(functions, 'ClientSession') @patch.object(functions, 'ClientSession')
async def test_soup_from_url(self, mock_session_cls): async def test_soup_from_url(self, mock_session_cls):
@ -47,21 +56,21 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
self.assertTupleEqual(expected_output, output) self.assertTupleEqual(expected_output, output)
def test_is_relevant_table_row(self): def test_is_relevant_table_row(self):
test_html = '<tr><td><div> Cash & Short Term Investments </div></td></tr>' test_soup = BeautifulSoup('<tr><td><div> Cash & Short Term Investments </div></td></tr>', HTML_PARSER)
test_soup = BeautifulSoup(test_html, HTML_PARSER)
self.assertTrue(functions.is_relevant_table_row(test_soup.tr)) self.assertTrue(functions.is_relevant_table_row(test_soup.tr))
test_html = '<tr><td><div> Cash & Short Term Investments Growth </div></td></tr>' test_soup = BeautifulSoup('<tr><td><div> Cash & Short Term Investments Growth </div></td></tr>', HTML_PARSER)
test_soup = BeautifulSoup(test_html, HTML_PARSER)
self.assertFalse(functions.is_relevant_table_row(test_soup.tr)) self.assertFalse(functions.is_relevant_table_row(test_soup.tr))
test_soup = BeautifulSoup('<tr><td><div> baz </div></td></tr>', HTML_PARSER)
self.assertFalse(functions.is_relevant_table_row(test_soup.tr))
self.assertFalse(functions.is_relevant_table_row(test_soup.div))
@patch.object(functions, 'is_relevant_table_row') @patch.object(functions, 'is_relevant_table_row')
def test_find_relevant_table_rows(self, mock_is_relevant_table_row): def test_find_relevant_table_rows(self, mock_is_relevant_table_row):
mock_is_relevant_table_row.return_value = True mock_is_relevant_table_row.return_value = True
test_table = self.test_soup.find('div', attrs={'class': 'financials'}) expected_output = self.test_soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')
expected_output = test_table.find_all('tr')[1:-1]
output = functions.find_relevant_table_rows(self.test_soup) output = functions.find_relevant_table_rows(self.test_soup)
self.assertListEqual(expected_output, output) self.assertListEqual(expected_output, output)
mock_is_relevant_table_row.assert_called_once_with(expected_output[0]) mock_is_relevant_table_row.assert_has_calls([call(expected_output[0]), call(expected_output[1])])
def test_extract_row_data(self): def test_extract_row_data(self):
test_table = self.test_soup.find('div', attrs={'class': 'financials'}).div.div.table test_table = self.test_soup.find('div', attrs={'class': 'financials'}).div.div.table