implemented row indent extraction

Maximilian Fajnberg 2021-12-26 19:04:23 +01:00
parent 8afab2fde8
commit 2b3fb9f7ba
3 changed files with 31 additions and 11 deletions

View File

@@ -11,4 +11,9 @@ FIN_STMT_URL_SUFFIX = {
     IS: '',
     CF: '/cash-flow'
 }
+INDENT_MAP = {
+    'indent--small': 1,
+    'indent--medium': 2,
+    'indent--large': 3,
+}
 END_DATE = 'End Date'
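Each of these CSS class names marks one level of nesting in the statement table; anything else maps to level 0. A quick sketch of the intended lookup (the sample class list here is invented):

    # Hypothetical class list on a table row; names absent from INDENT_MAP
    # fall back to indent level 0.
    classes = ['table__row', 'indent--medium']
    indent = next((level for name, level in INDENT_MAP.items() if name in classes), 0)
    assert indent == 2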

View File

@@ -1,13 +1,14 @@
 import logging
 import asyncio
-from typing import Union, List, Dict
+from typing import Union, Tuple, List, Dict
 from aiohttp.client import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from webutils import in_async_session, gather_in_batches
-from .constants import HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE
+from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE,
+                        INDENT_MAP)

 log = logging.getLogger(__name__)
@@ -16,7 +17,9 @@ log = logging.getLogger(__name__)
 # while its values will always be tuples with a length corresponding to the number of periods (columns)
 # and elements being the actual numbers, with the exception of the first key-value-pair, which will represent
 # the end dates of the reporting periods as strings (either years or quarters).
-ResultDict = dict[str, Union[tuple[float], tuple[str]]]
+HeaderData = Tuple[int, str, str, str, str, str]
+RowData = Tuple[int, float, float, float, float, float]
+ResultDict = dict[str, Union[HeaderData, RowData]]


 @in_async_session
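To make the new aliases concrete: element 0 of every tuple is now the row's indent level, and the END_DATE entry carries the period labels. A plausible (invented) two-period example, noting that the aliases themselves are spelled out for five reporting periods:

    # Illustrative only -- item names and figures are made up.
    statement: ResultDict = {
        'End Date': (0, '12/31/2020', '12/31/2021'),   # HeaderData-shaped
        'Sales/Revenue': (0, 1000.0, 1100.0),          # top-level row
        'Cost of Goods Sold': (1, 600.0, 640.0),       # nested one level deep
    }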
@@ -29,13 +32,25 @@ async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSou
     return BeautifulSoup(html, HTML_PARSER)


-def extract_end_dates(soup: BeautifulSoup) -> tuple[str]:
+def get_row_indent(tr: Tag) -> int:
+    try:
+        classes = tr.div.attrs['class']
+    except KeyError:
+        return 0
+    for class_name, indent in INDENT_MAP.items():
+        if class_name in classes:
+            return indent
+    return 0
+
+
+def extract_end_dates(soup: BeautifulSoup) -> HeaderData:
     """
     Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page of a
     financial statement.
     """
-    ths = soup.find('div', attrs={'class': 'financials'}).thead.find_all('th')
-    return tuple(str(th.string).strip() for th in ths[1:-1])
+    tr = soup.find('div', attrs={'class': 'financials'}).thead.tr
+    ths = tr.find_all('th')
+    return (get_row_indent(tr), ) + tuple(str(th.string).strip() for th in ths[1:-1])


 def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
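The new helper is easy to sanity-check in isolation. A minimal sketch, assuming get_row_indent and INDENT_MAP are importable, and using stand-in markup modelled on the tests rather than real MarketWatch HTML:

    from bs4 import BeautifulSoup

    html = ('<table><tbody>'
            '<tr><td><div class="indent--medium">Gross Income</div></td></tr>'
            '<tr><td><div>Sales/Revenue</div></td></tr>'
            '</tbody></table>')
    rows = BeautifulSoup(html, 'html.parser').find_all('tr')
    print(get_row_indent(rows[0]))  # 2 ('indent--medium' is in INDENT_MAP)
    print(get_row_indent(rows[1]))  # 0 (div has no class attribute, KeyError path)

One edge worth noting: a row containing no div at all would raise AttributeError rather than return 0, since only KeyError is caught.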
@@ -45,7 +60,7 @@ def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
     return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')


-def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]:
+def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
     """
     Returns the name of the item displayed in the table row (of a financial statement)
     as well as a number for each reporting period.
@@ -54,7 +69,7 @@ def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]:
     data_div = tr.find_all('td')[-1].div.div
     values_str: str = data_div.attrs['data-chart-data']
     values = tuple(float(s if s != '' else 0) for s in values_str.split(','))
-    return item_name, values
+    return item_name, (get_row_indent(tr), ) + values


 def extract_all_data(soup: BeautifulSoup) -> ResultDict:
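The value parsing itself is easy to verify by hand: empty fields in the comma-separated data-chart-data attribute are coerced to 0, and the indent level is now prepended. A small illustration with an invented attribute value and a hard-coded indent of 1:

    values_str = '1000.5,,-200'   # e.g. as found in a data-chart-data attribute
    values = tuple(float(s if s != '' else 0) for s in values_str.split(','))
    print((1, ) + values)         # (1, 1000.5, 0.0, -200.0)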

View File

@@ -213,19 +213,19 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
         # Since the web request is mocked we always receive the same HTML markup.
         expected_output = {
             BS: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)
             },
             IS: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)
             },
             CF: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)