minor refactoring & adjustments

This commit is contained in:
Maximilian Fajnberg 2021-12-26 17:55:34 +01:00
parent 38dd29f35b
commit c93fb9692e
5 changed files with 19 additions and 260 deletions

View File

@ -1,4 +1,3 @@
DEV_MODE = False
MAIN_LOGGER_NAME = 'mwfin' MAIN_LOGGER_NAME = 'mwfin'
HTML_PARSER = 'html.parser' HTML_PARSER = 'html.parser'
@ -13,213 +12,3 @@ FIN_STMT_URL_SUFFIX = {
CF: '/cash-flow' CF: '/cash-flow'
} }
END_DATE = 'End Date' END_DATE = 'End Date'
# Registry of the financial statement item names known to the scraper, grouped
# by statement (balance sheet, income statement, cash flow statement).
# Keys are looked up with the stripped text of a table row's first cell
# (see `is_relevant_table_row`); a missing key is treated as an unknown item.
# The boolean value says whether the row is relevant, i.e. has to be scraped.
# All items marked `False` do not need to be scraped
# because they are calculated from other items (e.g. growth or ratios).
# NOTE: all three statements share this single dict, so an item name can be
# listed only once. Entries that are commented out below repeat a key that
# already appears earlier in the dict (presumably disabled for that reason).
FIN_STMT_ITEMS = {
#################
# Balance Sheet #
#################
"Cash & Short Term Investments": True,
"Cash & Short Term Investments Growth": False,
"Cash Only": True,
"Short-Term Investments": True,
"Cash & ST Investments / Total Assets": False,
"Total Accounts Receivable": True,
"Total Accounts Receivable Growth": False,
"Accounts Receivables, Net": True,
"Accounts Receivables, Gross": True,
"Bad Debt/Doubtful Accounts": True,
"Other Receivable": True,
"Accounts Receivable Turnover": False,
"Inventories": True,
"Finished Goods": True,
"Work in Progress": True,
"Raw Materials": True,
"Progress Payments & Other": True,
"Other Current Assets": True,
"Miscellaneous Current Assets": True,
"Total Current Assets": True,
"Net Property, Plant & Equipment": True,
"Property, Plant & Equipment - Gross": True,
"Buildings": True,
"Land & Improvements": True,
"Computer Software and Equipment": True,
"Other Property, Plant & Equipment": True,
"Accumulated Depreciation": True,
"Total Investments and Advances": True,
"Other Long-Term Investments": True,
"Long-Term Note Receivables": True,
"Intangible Assets": True,
"Net Goodwill": True,
"Net Other Intangibles": True,
"Other Assets": True,
"Total Assets": True,
"Total Assets Growth": False,
"ST Debt & Current Portion LT Debt": True,
"Short Term Debt": True,
"Current Portion of Long Term Debt": True,
"Accounts Payable": True,
"Accounts Payable Growth": False,
"Income Tax Payable": True,
"Other Current Liabilities": True,
"Dividends Payable": True,
"Accrued Payroll": True,
"Miscellaneous Current Liabilities": True,
"Total Current Liabilities": True,
"Long-Term Debt": True,
"Long-Term Debt excl. Capitalized Leases": True,
"Non-Convertible Debt": True,
"Convertible Debt": True,
"Capitalized Lease Obligations": True,
"Provision for Risks & Charges": True,
"Deferred Taxes": True,
"Deferred Taxes - Credits": True,
"Deferred Taxes - Debit": True,
"Other Liabilities": True,
"Other Liabilities (excl. Deferred Income)": True,
"Deferred Income": True,
"Total Liabilities": True,
"Non-Equity Reserves": True,
"Total Liabilities / Total Assets": False,
"Preferred Stock (Carrying Value)": True,
"Redeemable Preferred Stock": True,
"Non-Redeemable Preferred Stock": True,
"Common Equity (Total)": True,
"Common Equity / Total Assets": False,
"Common Stock Par/Carry Value": True,
"Retained Earnings": True,
"ESOP Debt Guarantee": True,
"Cumulative Translation Adjustment/Unrealized For. Exch. Gain": True,
"Unrealized Gain/Loss Marketable Securities": True,
"Revaluation Reserves": True,
"Treasury Stock": True,
"Total Shareholders' Equity": True,
"Total Shareholders' Equity / Total Assets": False,
"Accumulated Minority Interest": True,
"Total Equity": True,
"Liabilities & Shareholders' Equity": True,
####################
# Income Statement #
####################
"Sales/Revenue": True,
"Sales Growth": False,
"Cost of Goods Sold (COGS) incl. D&A": True,
"COGS Growth": False,
"COGS excluding D&A": True,
"Depreciation & Amortization Expense": True,
"Depreciation": True,
"Amortization of Intangibles": True,
"Gross Income": True,
"Gross Income Growth": False,
"Gross Profit Margin": False,
"SG&A Expense": True,
"SGA Growth": False,
"Research & Development": True,
"Other SG&A": True,
"Other Operating Expense": True,
"Unusual Expense": True,
"EBIT after Unusual Expense": True,
"Non Operating Income/Expense": True,
"Non-Operating Interest Income": True,
"Equity in Affiliates (Pretax)": True,
"Interest Expense": True,
"Interest Expense Growth": False,
"Gross Interest Expense": True,
"Interest Capitalized": True,
"Pretax Income": True,
"Pretax Income Growth": False,
"Pretax Margin": False,
"Income Tax": True,
"Income Tax - Current Domestic": True,
"Income Tax - Current Foreign": True,
"Income Tax - Deferred Domestic": True,
"Income Tax - Deferred Foreign": True,
"Income Tax Credits": True,
"Equity in Affiliates": True,
"Other After Tax Income (Expense)": True,
"Consolidated Net Income": True,
"Minority Interest Expense": True,
"Net Income": True,
"Net Income Growth": False,
"Net Margin Growth": False,
"Extraordinaries & Discontinued Operations": True,
"Extra Items & Gain/Loss Sale Of Assets": True,
"Cumulative Effect - Accounting Chg": True,
"Discontinued Operations": True,
"Net Income After Extraordinaries": True,
"Preferred Dividends": True,
"Net Income Available to Common": True,
"EPS (Basic)": True,
"EPS (Basic) Growth": False,
"Basic Shares Outstanding": True,
"EPS (Diluted)": True,
"EPS (Diluted) Growth": False,
"Diluted Shares Outstanding": True,
"EBITDA": True,
"EBITDA Growth": False,
"EBITDA Margin": False,
#######################
# Cash Flow Statement #
#######################
"Net Income before Extraordinaries": True,
# Key already listed under Income Statement:
# "Net Income Growth": False,
"Depreciation, Depletion & Amortization": True,
"Depreciation and Depletion": True,
"Amortization of Intangible Assets": True,
"Deferred Taxes & Investment Tax Credit": True,
# Key already listed under Balance Sheet:
# "Deferred Taxes": True,
"Investment Tax Credit": True,
"Other Funds": True,
"Funds from Operations": True,
"Extraordinaries": True,
"Changes in Working Capital": True,
"Receivables": True,
# Key already listed under Balance Sheet:
# "Accounts Payable": True,
"Other Assets/Liabilities": True,
"Net Operating Cash Flow": True,
"Net Operating Cash Flow Growth": False,
"Net Operating Cash Flow / Sales": False,
"Capital Expenditures": True,
"Capital Expenditures Growth": False,
"Capital Expenditures / Sales": False,
"Capital Expenditures (Fixed Assets)": True,
"Capital Expenditures (Other Assets)": True,
"Net Assets from Acquisitions": True,
"Sale of Fixed Assets & Businesses": True,
"Purchase/Sale of Investments": True,
"Purchase of Investments": True,
"Sale/Maturity of Investments": True,
"Other Uses": True,
"Other Sources": True,
"Net Investing Cash Flow": True,
"Net Investing Cash Flow Growth": False,
"Net Investing Cash Flow / Sales": False,
"Cash Dividends Paid - Total": True,
"Common Dividends": True,
# Key already listed under Income Statement:
# "Preferred Dividends": True,
"Change in Capital Stock": True,
"Repurchase of Common & Preferred Stk.": True,
"Sale of Common & Preferred Stock": True,
"Proceeds from Stock Options": True,
"Other Proceeds from Sale of Stock": True,
"Issuance/Reduction of Debt, Net": True,
"Change in Current Debt": True,
"Change in Long-Term Debt": True,
"Issuance of Long-Term Debt": True,
"Reduction in Long-Term Debt": True,
# Keys already listed earlier in this Cash Flow Statement section:
# "Other Funds": True,
# "Other Uses": True,
# "Other Sources": True,
"Net Financing Cash Flow": True,
"Net Financing Cash Flow Growth": False,
"Net Financing Cash Flow / Sales": False,
"Exchange Rate Effect": True,
"Miscellaneous Funds": True,
"Net Change in Cash": True,
"Free Cash Flow": True,
"Free Cash Flow Growth": False,
"Free Cash Flow Yield": False,
}

View File

@ -4,7 +4,3 @@ class WrongAssumptions(Exception):
class UnexpectedMarkup(WrongAssumptions): class UnexpectedMarkup(WrongAssumptions):
pass pass
# Raised by `is_relevant_table_row` when the item name found in a table row
# is not a key of `FIN_STMT_ITEMS`.
class UnknownFinancialStatementItem(WrongAssumptions):
pass

View File

@ -1,16 +1,13 @@
import logging import logging
import asyncio import asyncio
from typing import Union, List, Dict from typing import Union, List, Dict
from datetime import datetime
from aiohttp.client import ClientSession from aiohttp.client import ClientSession
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from webutils import in_async_session, gather_in_batches from webutils import in_async_session, gather_in_batches
from .constants import (DEV_MODE, HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, FIN_STMT_ITEMS, from .constants import HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE
DEFAULT_CONCURRENT_BATCH_SIZE)
from .exceptions import UnknownFinancialStatementItem
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -19,7 +16,7 @@ log = logging.getLogger(__name__)
# while its values will always be tuples with a length corresponding to the number of periods (columns) # while its values will always be tuples with a length corresponding to the number of periods (columns)
# and elements being the actual numbers, with the exception of the first key-value-pair, which will represent # and elements being the actual numbers, with the exception of the first key-value-pair, which will represent
# the end dates of the reporting periods as strings (either years or quarters). # the end dates of the reporting periods as strings (either years or quarters).
ResultDict = dict[str, Union[tuple[int], tuple[str]]] ResultDict = dict[str, Union[tuple[float], tuple[str]]]
@in_async_session @in_async_session
@ -41,36 +38,14 @@ def extract_end_dates(soup: BeautifulSoup) -> tuple[str]:
return tuple(str(th.string).strip() for th in ths[1:-1]) return tuple(str(th.string).strip() for th in ths[1:-1])
def is_relevant_table_row(tr: Tag) -> bool: def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
"""
Returns True if the item in the table row is marked as relevant. Additionally warns when an item is unknown.
"""
item_name = str(tr.td.div.string).strip()
try:
return FIN_STMT_ITEMS[item_name]
except KeyError:
log.warning(f"Unknown item name '{item_name}' found in financial statement.")
raise UnknownFinancialStatementItem
def find_relevant_table_rows(soup: BeautifulSoup) -> List[Tag]:
""" """
Returns the table rows containing the data of interest. Returns the table rows containing the data of interest.
""" """
now = datetime.utcnow() return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')
trs = []
for tr in soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr'):
try:
if is_relevant_table_row(tr):
trs.append(tr)
except UnknownFinancialStatementItem:
if DEV_MODE:
with open(f'mwfin_unknown_items_{now.strftime("%Y-%m-%d_%H-%M-%S")}.html', 'w') as f:
f.write(str(soup))
return trs
def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]: def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]:
""" """
Returns the name of the item displayed in the table row (of a financial statement) Returns the name of the item displayed in the table row (of a financial statement)
as well as a number for each reporting period. as well as a number for each reporting period.
@ -78,7 +53,7 @@ def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]:
item_name = str(tr.td.div.string).strip() item_name = str(tr.td.div.string).strip()
data_div = tr.find_all('td')[-1].div.div data_div = tr.find_all('td')[-1].div.div
values_str: str = data_div.attrs['data-chart-data'] values_str: str = data_div.attrs['data-chart-data']
values = tuple(int(float(s if s != '' else 0)) for s in values_str.split(',')) values = tuple(float(s if s != '' else 0) for s in values_str.split(','))
return item_name, values return item_name, values
@ -87,7 +62,7 @@ def extract_all_data(soup: BeautifulSoup) -> ResultDict:
Extracts financials from the page. Extracts financials from the page.
""" """
output = {END_DATE: extract_end_dates(soup)} output = {END_DATE: extract_end_dates(soup)}
for row in find_relevant_table_rows(soup): for row in get_all_table_rows(soup):
row_data = extract_row_data(row) row_data = extract_row_data(row)
output[row_data[0]] = row_data[1] output[row_data[0]] = row_data[1]
return output return output

View File

@ -7,7 +7,6 @@ from bs4 import BeautifulSoup
from mwfin import functions from mwfin import functions
from mwfin.constants import HTML_PARSER, BASE_URL, FIN_STMT_URL_SUFFIX, IS, BS, CF, END_DATE from mwfin.constants import HTML_PARSER, BASE_URL, FIN_STMT_URL_SUFFIX, IS, BS, CF, END_DATE
from mwfin.exceptions import UnknownFinancialStatementItem
THIS_DIR = Path(__file__).parent THIS_DIR = Path(__file__).parent
@ -64,18 +63,18 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
def test_extract_row_data(self): def test_extract_row_data(self):
test_row = self.test_soup.find('div', attrs={'class': 'financials'}).tbody.tr test_row = self.test_soup.find('div', attrs={'class': 'financials'}).tbody.tr
expected_output = ('Cash & Short Term Investments', (11000000, -22000000)) expected_output = ('foo', (1., -2.))
output = functions.extract_row_data(test_row) output = functions.extract_row_data(test_row)
self.assertTupleEqual(expected_output, output) self.assertTupleEqual(expected_output, output)
@patch.object(functions, 'extract_row_data') @patch.object(functions, 'extract_row_data')
@patch.object(functions, 'find_relevant_table_rows') @patch.object(functions, 'get_all_table_rows')
@patch.object(functions, 'extract_end_dates') @patch.object(functions, 'extract_end_dates')
def test_extract_all_data(self, mock_extract_end_dates, mock_find_relevant_table_rows, mock_extract_row_data): def test_extract_all_data(self, mock_extract_end_dates, mock_get_all_table_rows, mock_extract_row_data):
test_end_dates = ('foo', 'bar') test_end_dates = ('foo', 'bar')
mock_extract_end_dates.return_value = test_end_dates mock_extract_end_dates.return_value = test_end_dates
test_relevant_rows = ['tr1', 'tr2'] test_relevant_rows = ['tr1', 'tr2']
mock_find_relevant_table_rows.return_value = test_relevant_rows mock_get_all_table_rows.return_value = test_relevant_rows
test_row_data = ('item_name', (123, 456)) test_row_data = ('item_name', (123, 456))
mock_extract_row_data.return_value = test_row_data mock_extract_row_data.return_value = test_row_data
expected_output = { expected_output = {
@ -86,7 +85,7 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
output = functions.extract_all_data(self.test_soup) output = functions.extract_all_data(self.test_soup)
self.assertDictEqual(expected_output, output) self.assertDictEqual(expected_output, output)
mock_extract_end_dates.assert_called_once_with(self.test_soup) mock_extract_end_dates.assert_called_once_with(self.test_soup)
mock_find_relevant_table_rows.assert_called_once_with(self.test_soup) mock_get_all_table_rows.assert_called_once_with(self.test_soup)
mock_extract_row_data.assert_has_calls([call(test_relevant_rows[0]), call(test_relevant_rows[1])]) mock_extract_row_data.assert_has_calls([call(test_relevant_rows[0]), call(test_relevant_rows[1])])
@patch.object(functions, 'extract_all_data') @patch.object(functions, 'extract_all_data')
@ -198,9 +197,9 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
symbol = 'foo' symbol = 'foo'
# Since the web request is mocked we always receive the same HTML markup. # Since the web request is mocked we always receive the same HTML markup.
expected_output = { expected_output = {
BS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)}, BS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'foo': (1., -2.), 'bar': (2., -3.)},
IS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)}, IS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'foo': (1., -2.), 'bar': (2., -3.)},
CF: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)} CF: {END_DATE: ('End_Date_1', 'End_Date_2'), 'foo': (1., -2.), 'bar': (2., -3.)}
} }
output = await functions.get_all_financials(symbol, session=mock_session_obj) output = await functions.get_all_financials(symbol, session=mock_session_obj)
self.assertDictEqual(expected_output, output) self.assertDictEqual(expected_output, output)

View File

@ -23,16 +23,16 @@
</thead> </thead>
<tbody> <tbody>
<tr> <tr>
<td><div> Cash & Short Term Investments </div><div> Cash & Short Term Investments </div></td> <td><div> foo </div><div> foo </div></td>
<td></td> <td></td>
<td></td> <td></td>
<td><div> <div data-chart-data="11000000.0,-22000000.0"><div></div></td> <td><div> <div data-chart-data="1.0,-2.0"><div></div></td>
</tr> </tr>
<tr> <tr>
<td><div> Cash & Short Term Investments Growth </div><div> Cash & Short Term Investments Growth </div></td> <td><div> bar </div><div> bar </div></td>
<td></td>
<td></td> <td></td>
<td></td> <td></td>
<td><div> <div data-chart-data="2.0,-3.0"><div></div></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>