implemented row indent extraction

Maximilian Fajnberg 2021-12-26 19:04:23 +01:00
parent 8afab2fde8
commit 2b3fb9f7ba
3 changed files with 31 additions and 11 deletions

View File

@@ -11,4 +11,9 @@ FIN_STMT_URL_SUFFIX = {
     IS: '',
     CF: '/cash-flow'
 }
+INDENT_MAP = {
+    'indent--small': 1,
+    'indent--medium': 2,
+    'indent--large': 3,
+}
 END_DATE = 'End Date'
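Each of these CSS class names marks one level of nesting in the statement table; anything else maps to level 0. A quick sketch of the intended lookup (the sample class list here is invented):

    # Hypothetical class list on a table row; names absent from INDENT_MAP
    # fall back to indent level 0.
    classes = ['table__row', 'indent--medium']
    indent = next((level for name, level in INDENT_MAP.items() if name in classes), 0)
    assert indent == 2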

View File

@@ -1,13 +1,14 @@
 import logging
 import asyncio
-from typing import Union, List, Dict
+from typing import Union, Tuple, List, Dict
 from aiohttp.client import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from webutils import in_async_session, gather_in_batches
-from .constants import HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE
+from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE,
+                        INDENT_MAP)

 log = logging.getLogger(__name__)
@@ -16,7 +17,9 @@ log = logging.getLogger(__name__)
 # while its values will always be tuples with a length corresponding to the number of periods (columns)
 # and elements being the actual numbers, with the exception of the first key-value-pair, which will represent
 # the end dates of the reporting periods as strings (either years or quarters).
-ResultDict = dict[str, Union[tuple[float], tuple[str]]]
+HeaderData = Tuple[int, str, str, str, str, str]
+RowData = Tuple[int, float, float, float, float, float]
+ResultDict = dict[str, Union[HeaderData, RowData]]


 @in_async_session
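To make the new aliases concrete: element 0 of every tuple is now the row's indent level, and the END_DATE entry carries the period labels. A plausible (invented) two-period example, noting that the aliases themselves are spelled out for five reporting periods:

    # Illustrative only -- item names and figures are made up.
    statement: ResultDict = {
        'End Date': (0, '12/31/2020', '12/31/2021'),   # HeaderData-shaped
        'Sales/Revenue': (0, 1000.0, 1100.0),          # top-level row
        'Cost of Goods Sold': (1, 600.0, 640.0),       # nested one level deep
    }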
@@ -29,13 +32,25 @@ async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSou
     return BeautifulSoup(html, HTML_PARSER)


-def extract_end_dates(soup: BeautifulSoup) -> tuple[str]:
+def get_row_indent(tr: Tag) -> int:
+    try:
+        classes = tr.div.attrs['class']
+    except KeyError:
+        return 0
+    for class_name, indent in INDENT_MAP.items():
+        if class_name in classes:
+            return indent
+    return 0
+
+
+def extract_end_dates(soup: BeautifulSoup) -> HeaderData:
     """
     Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page of a
     financial statement.
     """
-    ths = soup.find('div', attrs={'class': 'financials'}).thead.find_all('th')
-    return tuple(str(th.string).strip() for th in ths[1:-1])
+    tr = soup.find('div', attrs={'class': 'financials'}).thead.tr
+    ths = tr.find_all('th')
+    return (get_row_indent(tr), ) + tuple(str(th.string).strip() for th in ths[1:-1])


 def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
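The new helper is easy to sanity-check in isolation. A minimal sketch, assuming get_row_indent and INDENT_MAP are importable, and using stand-in markup modelled on the tests rather than real MarketWatch HTML:

    from bs4 import BeautifulSoup

    html = ('<table><tbody>'
            '<tr><td><div class="indent--medium">Gross Income</div></td></tr>'
            '<tr><td><div>Sales/Revenue</div></td></tr>'
            '</tbody></table>')
    rows = BeautifulSoup(html, 'html.parser').find_all('tr')
    print(get_row_indent(rows[0]))  # 2 ('indent--medium' is in INDENT_MAP)
    print(get_row_indent(rows[1]))  # 0 (div has no class attribute, KeyError path)

One edge worth noting: a row containing no div at all would raise AttributeError rather than return 0, since only KeyError is caught.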
@@ -45,7 +60,7 @@ def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
     return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')


-def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]:
+def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
     """
     Returns the name of the item displayed in the table row (of a financial statement)
     as well as a number for each reporting period.
@@ -54,7 +69,7 @@ def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]:
     data_div = tr.find_all('td')[-1].div.div
     values_str: str = data_div.attrs['data-chart-data']
     values = tuple(float(s if s != '' else 0) for s in values_str.split(','))
-    return item_name, values
+    return item_name, (get_row_indent(tr), ) + values


 def extract_all_data(soup: BeautifulSoup) -> ResultDict:
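The value parsing itself is easy to verify by hand: empty fields in the comma-separated data-chart-data attribute are coerced to 0, and the indent level is now prepended. A small illustration with an invented attribute value and a hard-coded indent of 1:

    values_str = '1000.5,,-200'   # e.g. as found in a data-chart-data attribute
    values = tuple(float(s if s != '' else 0) for s in values_str.split(','))
    print((1, ) + values)         # (1, 1000.5, 0.0, -200.0)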

View File

@@ -213,19 +213,19 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
         # Since the web request is mocked we always receive the same HTML markup.
         expected_output = {
             BS: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)
             },
             IS: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)
             },
             CF: {
-                END_DATE: ('End_Date_1', 'End_Date_2'),
+                END_DATE: (0, 'End_Date_1', 'End_Date_2'),
                 'foo': (1, 1., -2.),
                 'bar': (2, 2., -3.),
                 'baz': (3, 3., -4.)