diff --git a/src/mwfin/constants.py b/src/mwfin/constants.py
index 34ab031..ca1c3b2 100644
--- a/src/mwfin/constants.py
+++ b/src/mwfin/constants.py
@@ -11,4 +11,9 @@ FIN_STMT_URL_SUFFIX = {
     IS: '',
     CF: '/cash-flow'
 }
+INDENT_MAP = {
+    'indent--small': 1,
+    'indent--medium': 2,
+    'indent--large': 3,
+}
 END_DATE = 'End Date'
diff --git a/src/mwfin/functions.py b/src/mwfin/functions.py
index 4c28ac6..a126c4e 100644
--- a/src/mwfin/functions.py
+++ b/src/mwfin/functions.py
@@ -1,13 +1,14 @@
 import logging
 import asyncio
-from typing import Union, List, Dict
+from typing import Union, Tuple, List, Dict
 
 from aiohttp.client import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from webutils import in_async_session, gather_in_batches
 
-from .constants import HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE
+from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE,
+                        INDENT_MAP)
 
 log = logging.getLogger(__name__)
 
@@ -16,7 +17,9 @@ log = logging.getLogger(__name__)
 # while its values will always be tuples with a length corresponding to the number of periods (columns)
 # and elements being the actual numbers, with the exception of the first key-value-pair, which will represent
 # the end dates of the reporting periods as strings (either years or quarters).
-ResultDict = dict[str, Union[tuple[float], tuple[str]]]
+HeaderData = Tuple[int, str, str, str, str, str]
+RowData = Tuple[int, float, float, float, float, float]
+ResultDict = dict[str, Union[HeaderData, RowData]]
 
 
 @in_async_session
@@ -29,13 +32,29 @@ async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSou
     return BeautifulSoup(html, HTML_PARSER)
 
 
-def extract_end_dates(soup: BeautifulSoup) -> tuple[str]:
+def get_row_indent(tr: Tag) -> int:
+    """
+    Returns the indentation level of a table row according to `INDENT_MAP`, judging by the CSS classes of the
+    first `div` tag inside of it; returns 0, if the row has no `div`, the `div` has no class, or no class matches.
+    """
+    try:
+        classes = tr.div.attrs['class']
+    except (AttributeError, KeyError):  # `tr.div` is `None`, if the row contains no `div` tag at all
+        return 0
+    for class_name, indent in INDENT_MAP.items():
+        if class_name in classes:
+            return indent
+    return 0
+
+
+def extract_end_dates(soup: BeautifulSoup) -> HeaderData:
     """
     Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page
     of a financial statement.
     """
-    ths = soup.find('div', attrs={'class': 'financials'}).thead.find_all('th')
-    return tuple(str(th.string).strip() for th in ths[1:-1])
+    tr = soup.find('div', attrs={'class': 'financials'}).thead.tr
+    ths = tr.find_all('th')
+    return (get_row_indent(tr), ) + tuple(str(th.string).strip() for th in ths[1:-1])
 
 
 def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
@@ -45,7 +64,7 @@ def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
     return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')
 
 
-def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]:
+def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
     """
     Returns the name of the item displayed in the table row (of a financial statement)
     as well as a number for each reporting period.
@@ -54,7 +73,7 @@ def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]:
     data_div = tr.find_all('td')[-1].div.div
     values_str: str = data_div.attrs['data-chart-data']
     values = tuple(float(s if s != '' else 0) for s in values_str.split(','))
-    return item_name, values
+    return item_name, (get_row_indent(tr), ) + values
 
 
 def extract_all_data(soup: BeautifulSoup) -> ResultDict:
diff --git a/tests/test_functions.py b/tests/test_functions.py
index 588e82d..8b24758 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -213,19 +213,19 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
         # Since the web request is mocked we always receive the same HTML markup.
         expected_output = {
             BS: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)
             },
             IS: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)
             },
             CF: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)