From c93fb9692eefeadb16b10be78355aa17ae289d6b Mon Sep 17 00:00:00 2001 From: Maximilian Fajnberg Date: Sun, 26 Dec 2021 17:55:34 +0100 Subject: [PATCH] minor refactoring & adjustments --- src/mwfin/constants.py | 211 -------------------------------------- src/mwfin/exceptions.py | 4 - src/mwfin/functions.py | 39 ++----- tests/test_functions.py | 17 ++- tests/test_structure.html | 8 +- 5 files changed, 19 insertions(+), 260 deletions(-) diff --git a/src/mwfin/constants.py b/src/mwfin/constants.py index 5d778e8..34ab031 100644 --- a/src/mwfin/constants.py +++ b/src/mwfin/constants.py @@ -1,4 +1,3 @@ -DEV_MODE = False MAIN_LOGGER_NAME = 'mwfin' HTML_PARSER = 'html.parser' @@ -13,213 +12,3 @@ FIN_STMT_URL_SUFFIX = { CF: '/cash-flow' } END_DATE = 'End Date' - -# All items marked `False` do not need to be scraped -# because they are calculated from other items (e.g. growth or ratios). -FIN_STMT_ITEMS = { - ################# - # Balance Sheet # - ################# - "Cash & Short Term Investments": True, - "Cash & Short Term Investments Growth": False, - "Cash Only": True, - "Short-Term Investments": True, - "Cash & ST Investments / Total Assets": False, - "Total Accounts Receivable": True, - "Total Accounts Receivable Growth": False, - "Accounts Receivables, Net": True, - "Accounts Receivables, Gross": True, - "Bad Debt/Doubtful Accounts": True, - "Other Receivable": True, - "Accounts Receivable Turnover": False, - "Inventories": True, - "Finished Goods": True, - "Work in Progress": True, - "Raw Materials": True, - "Progress Payments & Other": True, - "Other Current Assets": True, - "Miscellaneous Current Assets": True, - "Total Current Assets": True, - "Net Property, Plant & Equipment": True, - "Property, Plant & Equipment - Gross": True, - "Buildings": True, - "Land & Improvements": True, - "Computer Software and Equipment": True, - "Other Property, Plant & Equipment": True, - "Accumulated Depreciation": True, - "Total Investments and Advances": True, - "Other Long-Term Investments": True, - "Long-Term Note Receivables": True, - "Intangible Assets": True, - "Net Goodwill": True, - "Net Other Intangibles": True, - "Other Assets": True, - "Total Assets": True, - "Total Assets Growth": False, - "ST Debt & Current Portion LT Debt": True, - "Short Term Debt": True, - "Current Portion of Long Term Debt": True, - "Accounts Payable": True, - "Accounts Payable Growth": False, - "Income Tax Payable": True, - "Other Current Liabilities": True, - "Dividends Payable": True, - "Accrued Payroll": True, - "Miscellaneous Current Liabilities": True, - "Total Current Liabilities": True, - "Long-Term Debt": True, - "Long-Term Debt excl. Capitalized Leases": True, - "Non-Convertible Debt": True, - "Convertible Debt": True, - "Capitalized Lease Obligations": True, - "Provision for Risks & Charges": True, - "Deferred Taxes": True, - "Deferred Taxes - Credits": True, - "Deferred Taxes - Debit": True, - "Other Liabilities": True, - "Other Liabilities (excl. Deferred Income)": True, - "Deferred Income": True, - "Total Liabilities": True, - "Non-Equity Reserves": True, - "Total Liabilities / Total Assets": False, - "Preferred Stock (Carrying Value)": True, - "Redeemable Preferred Stock": True, - "Non-Redeemable Preferred Stock": True, - "Common Equity (Total)": True, - "Common Equity / Total Assets": False, - "Common Stock Par/Carry Value": True, - "Retained Earnings": True, - "ESOP Debt Guarantee": True, - "Cumulative Translation Adjustment/Unrealized For. Exch. Gain": True, - "Unrealized Gain/Loss Marketable Securities": True, - "Revaluation Reserves": True, - "Treasury Stock": True, - "Total Shareholders' Equity": True, - "Total Shareholders' Equity / Total Assets": False, - "Accumulated Minority Interest": True, - "Total Equity": True, - "Liabilities & Shareholders' Equity": True, - - #################### - # Income Statement # - #################### - "Sales/Revenue": True, - "Sales Growth": False, - "Cost of Goods Sold (COGS) incl. D&A": True, - "COGS Growth": False, - "COGS excluding D&A": True, - "Depreciation & Amortization Expense": True, - "Depreciation": True, - "Amortization of Intangibles": True, - "Gross Income": True, - "Gross Income Growth": False, - "Gross Profit Margin": False, - "SG&A Expense": True, - "SGA Growth": False, - "Research & Development": True, - "Other SG&A": True, - "Other Operating Expense": True, - "Unusual Expense": True, - "EBIT after Unusual Expense": True, - "Non Operating Income/Expense": True, - "Non-Operating Interest Income": True, - "Equity in Affiliates (Pretax)": True, - "Interest Expense": True, - "Interest Expense Growth": False, - "Gross Interest Expense": True, - "Interest Capitalized": True, - "Pretax Income": True, - "Pretax Income Growth": False, - "Pretax Margin": False, - "Income Tax": True, - "Income Tax - Current Domestic": True, - "Income Tax - Current Foreign": True, - "Income Tax - Deferred Domestic": True, - "Income Tax - Deferred Foreign": True, - "Income Tax Credits": True, - "Equity in Affiliates": True, - "Other After Tax Income (Expense)": True, - "Consolidated Net Income": True, - "Minority Interest Expense": True, - "Net Income": True, - "Net Income Growth": False, - "Net Margin Growth": False, - "Extraordinaries & Discontinued Operations": True, - "Extra Items & Gain/Loss Sale Of Assets": True, - "Cumulative Effect - Accounting Chg": True, - "Discontinued Operations": True, - "Net Income After Extraordinaries": True, - "Preferred Dividends": True, - "Net Income Available to Common": True, - "EPS (Basic)": True, - "EPS (Basic) Growth": False, - "Basic Shares Outstanding": True, - "EPS (Diluted)": True, - "EPS (Diluted) Growth": False, - "Diluted Shares Outstanding": True, - "EBITDA": True, - "EBITDA Growth": False, - "EBITDA Margin": False, - - ####################### - # Cash Flow Statement # - ####################### - "Net Income before Extraordinaries": True, - # "Net Income Growth": False, - "Depreciation, Depletion & Amortization": True, - "Depreciation and Depletion": True, - "Amortization of Intangible Assets": True, - "Deferred Taxes & Investment Tax Credit": True, - # "Deferred Taxes": True, - "Investment Tax Credit": True, - "Other Funds": True, - "Funds from Operations": True, - "Extraordinaries": True, - "Changes in Working Capital": True, - "Receivables": True, - # "Accounts Payable": True, - "Other Assets/Liabilities": True, - "Net Operating Cash Flow": True, - "Net Operating Cash Flow Growth": False, - "Net Operating Cash Flow / Sales": False, - "Capital Expenditures": True, - "Capital Expenditures Growth": False, - "Capital Expenditures / Sales": False, - "Capital Expenditures (Fixed Assets)": True, - "Capital Expenditures (Other Assets)": True, - "Net Assets from Acquisitions": True, - "Sale of Fixed Assets & Businesses": True, - "Purchase/Sale of Investments": True, - "Purchase of Investments": True, - "Sale/Maturity of Investments": True, - "Other Uses": True, - "Other Sources": True, - "Net Investing Cash Flow": True, - "Net Investing Cash Flow Growth": False, - "Net Investing Cash Flow / Sales": False, - "Cash Dividends Paid - Total": True, - "Common Dividends": True, - # "Preferred Dividends": True, - "Change in Capital Stock": True, - "Repurchase of Common & Preferred Stk.": True, - "Sale of Common & Preferred Stock": True, - "Proceeds from Stock Options": True, - "Other Proceeds from Sale of Stock": True, - "Issuance/Reduction of Debt, Net": True, - "Change in Current Debt": True, - "Change in Long-Term Debt": True, - "Issuance of Long-Term Debt": True, - "Reduction in Long-Term Debt": True, - # "Other Funds": True, - # "Other Uses": True, - # "Other Sources": True, - "Net Financing Cash Flow": True, - "Net Financing Cash Flow Growth": False, - "Net Financing Cash Flow / Sales": False, - "Exchange Rate Effect": True, - "Miscellaneous Funds": True, - "Net Change in Cash": True, - "Free Cash Flow": True, - "Free Cash Flow Growth": False, - "Free Cash Flow Yield": False, -} diff --git a/src/mwfin/exceptions.py b/src/mwfin/exceptions.py index 04572f4..8480063 100644 --- a/src/mwfin/exceptions.py +++ b/src/mwfin/exceptions.py @@ -4,7 +4,3 @@ class WrongAssumptions(Exception): class UnexpectedMarkup(WrongAssumptions): pass - - -class UnknownFinancialStatementItem(WrongAssumptions): - pass diff --git a/src/mwfin/functions.py b/src/mwfin/functions.py index 3fce33e..4c28ac6 100644 --- a/src/mwfin/functions.py +++ b/src/mwfin/functions.py @@ -1,16 +1,13 @@ import logging import asyncio from typing import Union, List, Dict -from datetime import datetime from aiohttp.client import ClientSession from bs4 import BeautifulSoup from bs4.element import Tag from webutils import in_async_session, gather_in_batches -from .constants import (DEV_MODE, HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, FIN_STMT_ITEMS, - DEFAULT_CONCURRENT_BATCH_SIZE) -from .exceptions import UnknownFinancialStatementItem +from .constants import HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE log = logging.getLogger(__name__) @@ -19,7 +16,7 @@ log = logging.getLogger(__name__) # while its values will always be tuples with a length corresponding to the number of periods (columns) # and elements being the actual numbers, with the exception of the first key-value-pair, which will represent # the end dates of the reporting periods as strings (either years or quarters). -ResultDict = dict[str, Union[tuple[int], tuple[str]]] +ResultDict = dict[str, Union[tuple[float], tuple[str]]] @in_async_session @@ -41,36 +38,14 @@ def extract_end_dates(soup: BeautifulSoup) -> tuple[str]: return tuple(str(th.string).strip() for th in ths[1:-1]) -def is_relevant_table_row(tr: Tag) -> bool: - """ - Returns True if the item in the table row is marked as relevant. Additionally warns when an item is unknown. - """ - item_name = str(tr.td.div.string).strip() - try: - return FIN_STMT_ITEMS[item_name] - except KeyError: - log.warning(f"Unknown item name '{item_name}' found in financial statement.") - raise UnknownFinancialStatementItem - - -def find_relevant_table_rows(soup: BeautifulSoup) -> List[Tag]: +def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]: """ Returns the table rows containing the data of interest. """ - now = datetime.utcnow() - trs = [] - for tr in soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr'): - try: - if is_relevant_table_row(tr): - trs.append(tr) - except UnknownFinancialStatementItem: - if DEV_MODE: - with open(f'mwfin_unknown_items_{now.strftime("%Y-%m-%d_%H-%M-%S")}.html', 'w') as f: - f.write(str(soup)) - return trs + return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr') -def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]: +def extract_row_data(tr: Tag) -> tuple[str, tuple[float]]: """ Returns the name of the item displayed in the table row (of a financial statement) as well as a number for each reporting period. @@ -78,7 +53,7 @@ def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]: item_name = str(tr.td.div.string).strip() data_div = tr.find_all('td')[-1].div.div values_str: str = data_div.attrs['data-chart-data'] - values = tuple(int(float(s if s != '' else 0)) for s in values_str.split(',')) + values = tuple(float(s if s != '' else 0) for s in values_str.split(',')) return item_name, values @@ -87,7 +62,7 @@ def extract_all_data(soup: BeautifulSoup) -> ResultDict: Extracts financials from the page. """ output = {END_DATE: extract_end_dates(soup)} - for row in find_relevant_table_rows(soup): + for row in get_all_table_rows(soup): row_data = extract_row_data(row) output[row_data[0]] = row_data[1] return output diff --git a/tests/test_functions.py b/tests/test_functions.py index bd1e356..f0c7bf2 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -7,7 +7,6 @@ from bs4 import BeautifulSoup from mwfin import functions from mwfin.constants import HTML_PARSER, BASE_URL, FIN_STMT_URL_SUFFIX, IS, BS, CF, END_DATE -from mwfin.exceptions import UnknownFinancialStatementItem THIS_DIR = Path(__file__).parent @@ -64,18 +63,18 @@ class FunctionsTestCase(IsolatedAsyncioTestCase): def test_extract_row_data(self): test_row = self.test_soup.find('div', attrs={'class': 'financials'}).tbody.tr - expected_output = ('Cash & Short Term Investments', (11000000, -22000000)) + expected_output = ('foo', (1., -2.)) output = functions.extract_row_data(test_row) self.assertTupleEqual(expected_output, output) @patch.object(functions, 'extract_row_data') - @patch.object(functions, 'find_relevant_table_rows') + @patch.object(functions, 'get_all_table_rows') @patch.object(functions, 'extract_end_dates') - def test_extract_all_data(self, mock_extract_end_dates, mock_find_relevant_table_rows, mock_extract_row_data): + def test_extract_all_data(self, mock_extract_end_dates, mock_get_all_table_rows, mock_extract_row_data): test_end_dates = ('foo', 'bar') mock_extract_end_dates.return_value = test_end_dates test_relevant_rows = ['tr1', 'tr2'] - mock_find_relevant_table_rows.return_value = test_relevant_rows + mock_get_all_table_rows.return_value = test_relevant_rows test_row_data = ('item_name', (123, 456)) mock_extract_row_data.return_value = test_row_data expected_output = { @@ -86,7 +85,7 @@ class FunctionsTestCase(IsolatedAsyncioTestCase): output = functions.extract_all_data(self.test_soup) self.assertDictEqual(expected_output, output) mock_extract_end_dates.assert_called_once_with(self.test_soup) - mock_find_relevant_table_rows.assert_called_once_with(self.test_soup) + mock_get_all_table_rows.assert_called_once_with(self.test_soup) mock_extract_row_data.assert_has_calls([call(test_relevant_rows[0]), call(test_relevant_rows[1])]) @patch.object(functions, 'extract_all_data') @@ -198,9 +197,9 @@ class FunctionsTestCase(IsolatedAsyncioTestCase): symbol = 'foo' # Since the web request is mocked we always receive the same HTML markup. expected_output = { - BS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)}, - IS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)}, - CF: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)} + BS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'foo': (1., -2.), 'bar': (2., -3.)}, + IS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'foo': (1., -2.), 'bar': (2., -3.)}, + CF: {END_DATE: ('End_Date_1', 'End_Date_2'), 'foo': (1., -2.), 'bar': (2., -3.)} } output = await functions.get_all_financials(symbol, session=mock_session_obj) self.assertDictEqual(expected_output, output) diff --git a/tests/test_structure.html b/tests/test_structure.html index 7e59e1b..95630f4 100644 --- a/tests/test_structure.html +++ b/tests/test_structure.html @@ -23,16 +23,16 @@ -
Cash & Short Term Investments
Cash & Short Term Investments
+
foo
foo
-
+
-
Cash & Short Term Investments Growth
Cash & Short Term Investments Growth
- +
bar
bar
+