Compare commits

...

5 Commits

5 changed files with 86 additions and 300 deletions

View File

@ -1,4 +1,3 @@
DEV_MODE = False
MAIN_LOGGER_NAME = 'mwfin'
HTML_PARSER = 'html.parser'
@ -12,214 +11,9 @@ FIN_STMT_URL_SUFFIX = {
IS: '',
CF: '/cash-flow'
}
END_DATE = 'End Date'
# All items marked `False` do not need to be scraped
# because they are calculated from other items (e.g. growth or ratios).
FIN_STMT_ITEMS = {
#################
# Balance Sheet #
#################
"Cash & Short Term Investments": True,
"Cash & Short Term Investments Growth": False,
"Cash Only": True,
"Short-Term Investments": True,
"Cash & ST Investments / Total Assets": False,
"Total Accounts Receivable": True,
"Total Accounts Receivable Growth": False,
"Accounts Receivables, Net": True,
"Accounts Receivables, Gross": True,
"Bad Debt/Doubtful Accounts": True,
"Other Receivable": True,
"Accounts Receivable Turnover": False,
"Inventories": True,
"Finished Goods": True,
"Work in Progress": True,
"Raw Materials": True,
"Progress Payments & Other": True,
"Other Current Assets": True,
"Miscellaneous Current Assets": True,
"Total Current Assets": True,
"Net Property, Plant & Equipment": True,
"Property, Plant & Equipment - Gross": True,
"Buildings": True,
"Land & Improvements": True,
"Computer Software and Equipment": True,
"Other Property, Plant & Equipment": True,
"Accumulated Depreciation": True,
"Total Investments and Advances": True,
"Other Long-Term Investments": True,
"Long-Term Note Receivables": True,
"Intangible Assets": True,
"Net Goodwill": True,
"Net Other Intangibles": True,
"Other Assets": True,
"Total Assets": True,
"Total Assets Growth": False,
"ST Debt & Current Portion LT Debt": True,
"Short Term Debt": True,
"Current Portion of Long Term Debt": True,
"Accounts Payable": True,
"Accounts Payable Growth": False,
"Income Tax Payable": True,
"Other Current Liabilities": True,
"Dividends Payable": True,
"Accrued Payroll": True,
"Miscellaneous Current Liabilities": True,
"Total Current Liabilities": True,
"Long-Term Debt": True,
"Long-Term Debt excl. Capitalized Leases": True,
"Non-Convertible Debt": True,
"Convertible Debt": True,
"Capitalized Lease Obligations": True,
"Provision for Risks & Charges": True,
"Deferred Taxes": True,
"Deferred Taxes - Credits": True,
"Deferred Taxes - Debit": True,
"Other Liabilities": True,
"Other Liabilities (excl. Deferred Income)": True,
"Deferred Income": True,
"Total Liabilities": True,
"Non-Equity Reserves": True,
"Total Liabilities / Total Assets": False,
"Preferred Stock (Carrying Value)": True,
"Redeemable Preferred Stock": True,
"Non-Redeemable Preferred Stock": True,
"Common Equity (Total)": True,
"Common Equity / Total Assets": False,
"Common Stock Par/Carry Value": True,
"Retained Earnings": True,
"ESOP Debt Guarantee": True,
"Cumulative Translation Adjustment/Unrealized For. Exch. Gain": True,
"Unrealized Gain/Loss Marketable Securities": True,
"Revaluation Reserves": True,
"Treasury Stock": True,
"Total Shareholders' Equity": True,
"Total Shareholders' Equity / Total Assets": False,
"Accumulated Minority Interest": True,
"Total Equity": True,
"Liabilities & Shareholders' Equity": True,
####################
# Income Statement #
####################
"Sales/Revenue": True,
"Sales Growth": False,
"Cost of Goods Sold (COGS) incl. D&A": True,
"COGS Growth": False,
"COGS excluding D&A": True,
"Depreciation & Amortization Expense": True,
"Depreciation": True,
"Amortization of Intangibles": True,
"Gross Income": True,
"Gross Income Growth": False,
"Gross Profit Margin": False,
"SG&A Expense": True,
"SGA Growth": False,
"Research & Development": True,
"Other SG&A": True,
"Other Operating Expense": True,
"Unusual Expense": True,
"EBIT after Unusual Expense": True,
"Non Operating Income/Expense": True,
"Non-Operating Interest Income": True,
"Equity in Affiliates (Pretax)": True,
"Interest Expense": True,
"Interest Expense Growth": False,
"Gross Interest Expense": True,
"Interest Capitalized": True,
"Pretax Income": True,
"Pretax Income Growth": False,
"Pretax Margin": False,
"Income Tax": True,
"Income Tax - Current Domestic": True,
"Income Tax - Current Foreign": True,
"Income Tax - Deferred Domestic": True,
"Income Tax - Deferred Foreign": True,
"Income Tax Credits": True,
"Equity in Affiliates": True,
"Other After Tax Income (Expense)": True,
"Consolidated Net Income": True,
"Minority Interest Expense": True,
"Net Income": True,
"Net Income Growth": False,
"Net Margin Growth": False,
"Extraordinaries & Discontinued Operations": True,
"Extra Items & Gain/Loss Sale Of Assets": True,
"Cumulative Effect - Accounting Chg": True,
"Discontinued Operations": True,
"Net Income After Extraordinaries": True,
"Preferred Dividends": True,
"Net Income Available to Common": True,
"EPS (Basic)": True,
"EPS (Basic) Growth": False,
"Basic Shares Outstanding": True,
"EPS (Diluted)": True,
"EPS (Diluted) Growth": False,
"Diluted Shares Outstanding": True,
"EBITDA": True,
"EBITDA Growth": False,
"EBITDA Margin": False,
#######################
# Cash Flow Statement #
#######################
"Net Income before Extraordinaries": True,
# "Net Income Growth": False,
"Depreciation, Depletion & Amortization": True,
"Depreciation and Depletion": True,
"Amortization of Intangible Assets": True,
"Deferred Taxes & Investment Tax Credit": True,
# "Deferred Taxes": True,
"Investment Tax Credit": True,
"Other Funds": True,
"Funds from Operations": True,
"Extraordinaries": True,
"Changes in Working Capital": True,
"Receivables": True,
# "Accounts Payable": True,
"Other Assets/Liabilities": True,
"Net Operating Cash Flow": True,
"Net Operating Cash Flow Growth": False,
"Net Operating Cash Flow / Sales": False,
"Capital Expenditures": True,
"Capital Expenditures Growth": False,
"Capital Expenditures / Sales": False,
"Capital Expenditures (Fixed Assets)": True,
"Capital Expenditures (Other Assets)": True,
"Net Assets from Acquisitions": True,
"Sale of Fixed Assets & Businesses": True,
"Purchase/Sale of Investments": True,
"Purchase of Investments": True,
"Sale/Maturity of Investments": True,
"Other Uses": True,
"Other Sources": True,
"Net Investing Cash Flow": True,
"Net Investing Cash Flow Growth": False,
"Net Investing Cash Flow / Sales": False,
"Cash Dividends Paid - Total": True,
"Common Dividends": True,
# "Preferred Dividends": True,
"Change in Capital Stock": True,
"Repurchase of Common & Preferred Stk.": True,
"Sale of Common & Preferred Stock": True,
"Proceeds from Stock Options": True,
"Other Proceeds from Sale of Stock": True,
"Issuance/Reduction of Debt, Net": True,
"Change in Current Debt": True,
"Change in Long-Term Debt": True,
"Issuance of Long-Term Debt": True,
"Reduction in Long-Term Debt": True,
# "Other Funds": True,
# "Other Uses": True,
# "Other Sources": True,
"Net Financing Cash Flow": True,
"Net Financing Cash Flow Growth": False,
"Net Financing Cash Flow / Sales": False,
"Exchange Rate Effect": True,
"Miscellaneous Funds": True,
"Net Change in Cash": True,
"Free Cash Flow": True,
"Free Cash Flow Growth": False,
"Free Cash Flow Yield": False,
# Maps a CSS class name to a numeric indentation level.
# `get_row_indent` looks these names up in the `class` attribute of a table row's first <div>
# and falls back to level 0 when none of them is present.
INDENT_MAP = {
'indent--small': 1,
'indent--medium': 2,
'indent--large': 3,
}
END_DATE = 'End Date'

View File

@ -4,7 +4,3 @@ class WrongAssumptions(Exception):
class UnexpectedMarkup(WrongAssumptions):
pass
class UnknownFinancialStatementItem(WrongAssumptions):
pass

View File

@ -1,16 +1,14 @@
import logging
import asyncio
from typing import Union, List, Dict
from datetime import datetime
from typing import Union, Tuple, List, Dict
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
from webutils import in_async_session, gather_in_batches
from .constants import (DEV_MODE, HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, FIN_STMT_ITEMS,
DEFAULT_CONCURRENT_BATCH_SIZE)
from .exceptions import UnknownFinancialStatementItem
from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE,
INDENT_MAP)
log = logging.getLogger(__name__)
@ -19,7 +17,9 @@ log = logging.getLogger(__name__)
# while its values will always be tuples with a length corresponding to the number of periods (columns)
# and elements being the actual numbers, with the exception of the first key-value-pair, which will represent
# the end dates of the reporting periods as strings (either years or quarters).
ResultDict = dict[str, Union[tuple[int], tuple[str]]]
HeaderData = Tuple[int, str, str, str, str, str]
RowData = Tuple[int, float, float, float, float, float]
ResultDict = dict[str, Union[HeaderData, RowData]]
@in_async_session
@ -32,45 +32,35 @@ async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSou
return BeautifulSoup(html, HTML_PARSER)
def extract_end_dates(soup: BeautifulSoup) -> tuple[str]:
def get_row_indent(tr: Tag) -> int:
    """
    Determines the indentation level of a table row from the CSS classes of its first `<div>` descendant.

    Args:
        tr: The table row tag to inspect.

    Returns:
        The `INDENT_MAP` value of the first of its class names found on the `<div>`;
        0 if the row contains no `<div>`, the `<div>` has no `class` attribute, or none of the class names match.
    """
    div = tr.div
    if div is None:
        # A row without any <div> has no indentation marker to read.
        return 0
    # A missing `class` attribute is treated the same as an empty class list.
    classes = div.attrs.get('class', ())
    for class_name, indent in INDENT_MAP.items():
        if class_name in classes:
            return indent
    return 0
def extract_end_dates(soup: BeautifulSoup) -> HeaderData:
"""
Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page of a
financial statement.
"""
ths = soup.find('div', attrs={'class': 'financials'}).thead.find_all('th')
return tuple(str(th.string).strip() for th in ths[1:-1])
tr = soup.find('div', attrs={'class': 'financials'}).thead.tr
ths = tr.find_all('th')
return (get_row_indent(tr), ) + tuple(str(th.string).strip() for th in ths[1:-1])
def is_relevant_table_row(tr: Tag) -> bool:
"""
Returns True if the item in the table row is marked as relevant. Additionally warns when an item is unknown.
"""
item_name = str(tr.td.div.string).strip()
try:
return FIN_STMT_ITEMS[item_name]
except KeyError:
log.warning(f"Unknown item name '{item_name}' found in financial statement.")
raise UnknownFinancialStatementItem
def find_relevant_table_rows(soup: BeautifulSoup) -> List[Tag]:
def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
"""
Returns the table rows containing the data of interest.
"""
now = datetime.utcnow()
trs = []
for tr in soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr'):
try:
if is_relevant_table_row(tr):
trs.append(tr)
except UnknownFinancialStatementItem:
if DEV_MODE:
with open(f'mwfin_unknown_items_{now.strftime("%Y-%m-%d_%H-%M-%S")}.html', 'w') as f:
f.write(str(soup))
return trs
return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')
def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]:
def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
"""
Returns the name of the item displayed in the table row (of a financial statement)
as well as a number for each reporting period.
@ -78,8 +68,8 @@ def extract_row_data(tr: Tag) -> tuple[str, tuple[int]]:
item_name = str(tr.td.div.string).strip()
data_div = tr.find_all('td')[-1].div.div
values_str: str = data_div.attrs['data-chart-data']
values = tuple(int(float(s if s != '' else 0)) for s in values_str.split(','))
return item_name, values
values = tuple(float(s if s != '' else 0) for s in values_str.split(','))
return item_name, (get_row_indent(tr), ) + values
def extract_all_data(soup: BeautifulSoup) -> ResultDict:
@ -87,7 +77,7 @@ def extract_all_data(soup: BeautifulSoup) -> ResultDict:
Extracts financials from the page.
"""
output = {END_DATE: extract_end_dates(soup)}
for row in find_relevant_table_rows(soup):
for row in get_all_table_rows(soup):
row_data = extract_row_data(row)
output[row_data[0]] = row_data[1]
return output

View File

@ -7,7 +7,6 @@ from bs4 import BeautifulSoup
from mwfin import functions
from mwfin.constants import HTML_PARSER, BASE_URL, FIN_STMT_URL_SUFFIX, IS, BS, CF, END_DATE
from mwfin.exceptions import UnknownFinancialStatementItem
THIS_DIR = Path(__file__).parent
@ -52,59 +51,45 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
output = await functions.soup_from_url('baz', mock_session_obj)
self.assertEqual(expected_output, output)
def test_extract_end_dates(self):
expected_output = ('End_Date_1', 'End_Date_2')
def test_get_row_indent(self):
    """
    The first `<tr>` of the fixture must yield indent 0;
    every following row must yield its 1-based position among the remaining rows.
    """
    rows = self.test_soup.find_all('tr')
    first_row, remaining_rows = rows[0], rows[1:]
    self.assertEqual(0, functions.get_row_indent(first_row))
    for expected_indent, row in enumerate(remaining_rows, start=1):
        self.assertEqual(expected_indent, functions.get_row_indent(row))
@patch.object(functions, 'get_row_indent')
def test_extract_end_dates(self, mock_get_row_indent):
    """
    With `get_row_indent` patched to return 0, the result must be that indent followed by the fixture's end dates,
    and the patched helper must have been called exactly once with the first `<tr>` of the fixture.
    """
    mock_get_row_indent.return_value = 0
    result = functions.extract_end_dates(self.test_soup)
    self.assertTupleEqual((0, 'End_Date_1', 'End_Date_2'), result)
    mock_get_row_indent.assert_called_once_with(self.test_soup.tr)
def test_is_relevant_table_row(self):
test_soup = BeautifulSoup('<tr><td><div> Cash & Short Term Investments </div></td></tr>', HTML_PARSER)
self.assertTrue(functions.is_relevant_table_row(test_soup.tr))
test_soup = BeautifulSoup('<tr><td><div> Cash & Short Term Investments Growth </div></td></tr>', HTML_PARSER)
self.assertFalse(functions.is_relevant_table_row(test_soup.tr))
test_soup = BeautifulSoup('<tr><td><div> baz </div></td></tr>', HTML_PARSER)
with self.assertRaises(UnknownFinancialStatementItem):
functions.is_relevant_table_row(test_soup.tr)
@patch.object(functions, 'open')
@patch.object(functions, 'is_relevant_table_row')
def test_find_relevant_table_rows(self, mock_is_relevant_table_row, mock_open):
mock_is_relevant_table_row.return_value = True
def test_get_all_table_rows(self):
expected_output = self.test_soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')
tr0, tr1 = expected_output
output = functions.find_relevant_table_rows(self.test_soup)
self.assertListEqual(expected_output, output)
mock_is_relevant_table_row.assert_has_calls([call(tr0), call(tr1)])
mock_is_relevant_table_row.reset_mock()
output = functions.get_all_table_rows(self.test_soup)
self.assertSequenceEqual(expected_output, output)
mock_is_relevant_table_row.side_effect = UnknownFinancialStatementItem()
expected_output = self.test_soup.find_all('thistagdoesntexist')
output = functions.find_relevant_table_rows(self.test_soup)
self.assertListEqual(expected_output, output)
mock_is_relevant_table_row.assert_has_calls([call(tr0), call(tr1)])
mock_is_relevant_table_row.reset_mock()
mock_write = mock_open.return_value.__enter__.return_value.write
with patch.object(functions, 'DEV_MODE', new=True):
output = functions.find_relevant_table_rows(self.test_soup)
self.assertListEqual(expected_output, output)
mock_is_relevant_table_row.assert_has_calls([call(tr0), call(tr1)])
mock_write.assert_has_calls([call(str(self.test_soup)), call(str(self.test_soup))])
def test_extract_row_data(self):
@patch.object(functions, 'get_row_indent')
def test_extract_row_data(self, mock_get_row_indent):
mock_get_row_indent.return_value = 1
test_row = self.test_soup.find('div', attrs={'class': 'financials'}).tbody.tr
expected_output = ('Cash & Short Term Investments', (11000000, -22000000))
expected_output = ('foo', (1, 1., -2.))
output = functions.extract_row_data(test_row)
self.assertTupleEqual(expected_output, output)
mock_get_row_indent.assert_called_once_with(test_row)
@patch.object(functions, 'extract_row_data')
@patch.object(functions, 'find_relevant_table_rows')
@patch.object(functions, 'get_all_table_rows')
@patch.object(functions, 'extract_end_dates')
def test_extract_all_data(self, mock_extract_end_dates, mock_find_relevant_table_rows, mock_extract_row_data):
def test_extract_all_data(self, mock_extract_end_dates, mock_get_all_table_rows, mock_extract_row_data):
test_end_dates = ('foo', 'bar')
mock_extract_end_dates.return_value = test_end_dates
test_relevant_rows = ['tr1', 'tr2']
mock_find_relevant_table_rows.return_value = test_relevant_rows
mock_get_all_table_rows.return_value = test_relevant_rows
test_row_data = ('item_name', (123, 456))
mock_extract_row_data.return_value = test_row_data
expected_output = {
@ -115,7 +100,7 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
output = functions.extract_all_data(self.test_soup)
self.assertDictEqual(expected_output, output)
mock_extract_end_dates.assert_called_once_with(self.test_soup)
mock_find_relevant_table_rows.assert_called_once_with(self.test_soup)
mock_get_all_table_rows.assert_called_once_with(self.test_soup)
mock_extract_row_data.assert_has_calls([call(test_relevant_rows[0]), call(test_relevant_rows[1])])
@patch.object(functions, 'extract_all_data')
@ -227,9 +212,24 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
symbol = 'foo'
# Since the web request is mocked we always receive the same HTML markup.
expected_output = {
BS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)},
IS: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)},
CF: {END_DATE: ('End_Date_1', 'End_Date_2'), 'Cash & Short Term Investments': (11000000, -22000000)}
BS: {
END_DATE: (0, 'End_Date_1', 'End_Date_2'),
'foo': (1, 1., -2.),
'bar': (2, 2., -3.),
'baz': (3, 3., -4.)
},
IS: {
END_DATE: (0, 'End_Date_1', 'End_Date_2'),
'foo': (1, 1., -2.),
'bar': (2, 2., -3.),
'baz': (3, 3., -4.)
},
CF: {
END_DATE: (0, 'End_Date_1', 'End_Date_2'),
'foo': (1, 1., -2.),
'bar': (2, 2., -3.),
'baz': (3, 3., -4.)
}
}
output = await functions.get_all_financials(symbol, session=mock_session_obj)
self.assertDictEqual(expected_output, output)

View File

@ -15,7 +15,7 @@
<table>
<thead>
<tr>
<th><div> !!Item </div><div> !!Item </div></th>
<th><div class="xyz abc"> !!Item </div><div> !!Item </div></th>
<th><div> End_Date_1 </div></th>
<th><div> End_Date_2 </div></th>
<th></th>
@ -23,16 +23,22 @@
</thead>
<tbody>
<tr>
<td><div> Cash & Short Term Investments </div><div> Cash & Short Term Investments </div></td>
<td><div class="xyz indent--small"> foo </div><div> foo </div></td>
<td></td>
<td></td>
<td><div> <div data-chart-data="11000000.0,-22000000.0"><div></div></td>
<td><div> <div data-chart-data="1.0,-2.0"><div></div></td>
</tr>
<tr>
<td><div> Cash & Short Term Investments Growth </div><div> Cash & Short Term Investments Growth </div></td>
<td><div class="xyz indent--medium"> bar </div><div> bar </div></td>
<td></td>
<td></td>
<td><div> <div data-chart-data="2.0,-3.0"><div></div></td>
</tr>
<tr>
<td><div class="xyz indent--large"> baz </div><div> baz </div></td>
<td></td>
<td></td>
<td><div> <div data-chart-data="3.0,-4.0"><div></div></td>
</tr>
</tbody>
</table>