extensive docstrings; made all functions public (not protected); minor refactoring

This commit is contained in:
Maximilian Fajnberg 2021-12-28 23:59:54 +01:00
parent f29f22a52e
commit 911fa15f46
3 changed files with 196 additions and 72 deletions

View File

@ -19,6 +19,7 @@ VERBOSE = 'verbose'
def parse_cli() -> dict:
"""Returns the parsed command line arguments for the program as a dictionary."""
parser = ArgumentParser(description="Scrape company financials")
parser.add_argument(
TICKER_SYMBOL,
@ -36,7 +37,7 @@ def parse_cli() -> dict:
type=int,
default=DEFAULT_CONCURRENT_BATCH_SIZE,
help="If multiple ticker symbols are passed, the company financials can be scraped concurrently. "
"This argument determines how many companies are scraped concurrently at any moment in time. "
"This argument determines how many companies are scraped concurrently. "
"By default, they are scraped sequentially (i.e. a batch size of 1)."
)
parser.add_argument(
@ -61,6 +62,10 @@ def parse_cli() -> dict:
def configure_logging(verbosity: int) -> None:
"""
Sets up logging by adding a stderr stream handler and setting the root logger's level according to the specified verbosity.
A verbosity of 0 (or less) sets the log level to CRITICAL.
"""
root_logger = logging.getLogger()
root_logger.addHandler(logging.StreamHandler())
root_logger.setLevel(logging.CRITICAL)
@ -73,6 +78,7 @@ def configure_logging(verbosity: int) -> None:
async def main() -> None:
"""Parses CLI arguments, configures logging to stderr, performs the scraping, and prints/saves the data."""
args = parse_cli()
configure_logging(args[VERBOSE])
data = await get_all_financials(*args[TICKER_SYMBOL], quarterly=args[QUARTERLY],

View File

@ -1,10 +1,10 @@
import logging
import asyncio
from typing import Union, Tuple, List, Dict
from typing import Union, Tuple, Dict
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
from bs4.element import Tag, ResultSet
from webutils import in_async_session, gather_in_batches
from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE,
@ -13,12 +13,12 @@ from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_UR
log = logging.getLogger(__name__)
# The resulting dictionary's keys correspond to the name of the item (row) in the financial statement,
# while its values will always be tuples with a length corresponding to the number of periods (columns)
# and elements being the actual numbers, with the exception of the first key-value-pair, which will represent
# the end dates of the reporting periods as strings (either years or quarters).
# First element in each Tuple is an integer indicating the row indent
HeaderData = Tuple[int, str, str, str, str, str]
RowData = Tuple[int, float, float, float, float, float]
# The resulting dictionary's keys correspond to the name of the item (row) in the financial statement.
# The first value is a tuple of the end dates of the reporting periods as strings (either years or quarters; see `HeaderData`).
# The other values are the actual data tuples containing the financial figures.
ResultDict = dict[str, Union[HeaderData, RowData]]
@ -26,6 +26,16 @@ ResultDict = dict[str, Union[HeaderData, RowData]]
async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
"""
Requests a web page and turns the response text into BeautifulSoup.
Args:
url:
The GET request is sent to this URL
session (optional):
If passed an `aiohttp.ClientSession` object, it will be used to perform the request.
Otherwise a new session is created and automatically closed after the request (see `@in_async_session`).
Returns:
The parsed html response text as BeautifulSoup
"""
async with session.get(url) as response:
html = await response.text()
@ -33,6 +43,17 @@ async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSou
def get_row_indent(tr: Tag) -> int:
"""
Determines the visual indent of a table row.
Some positions in a financial statement have sub-positions below them indicated by indentation of the text in the
position name's cell.
Args:
tr: The table row element
Returns:
Each indentation level corresponds to an integer. 0 = no indentation, 1 = small, 2 = medium, 3 = large
"""
try:
classes = tr.div.attrs['class']
except KeyError:
@ -47,15 +68,27 @@ def extract_end_dates(soup: BeautifulSoup) -> HeaderData:
"""
Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page of a
financial statement.
Args:
soup: The parsed page containing the financial statement
Returns:
A 6-tuple, the first element being the indent (in this case 0) and the rest being the actual end dates.
"""
tr = soup.find('div', attrs={'class': 'financials'}).thead.tr
ths = tr.find_all('th')
return (get_row_indent(tr), ) + tuple(str(th.string).strip() for th in ths[1:-1])
return (0, ) + tuple(str(th.string).strip() for th in ths[1:-1])
def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
def get_all_table_rows(soup: BeautifulSoup) -> ResultSet:
"""
Returns the table rows containing the data of interest.
Args:
soup: The parsed page containing the financial statement
Returns:
All table rows containing data from the financial statement
"""
return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')
@ -63,7 +96,14 @@ def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
"""
Returns the name of the item displayed in the table row (of a financial statement)
as well as a number for each reporting period.
as well as the position's indent and a figure for each reporting period.
Args:
tr: A table row containing data from a financial statement
Returns:
A 2-tuple where the first element is the position's name and the second is a 6-tuple, of which the first element is
the indent and the rest are the actual figures.
"""
item_name = str(tr.td.div.string).strip()
data_div = tr.find_all('td')[-1].div.div
@ -74,7 +114,14 @@ def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
def extract_all_data(soup: BeautifulSoup) -> ResultDict:
"""
Extracts financials from the page.
Extracts financials from the page, which can contain either a balance sheet, income statement or cash flow
statement.
Args:
soup: The parsed page containing a financial statement
Returns:
Custom result dictionary (see `ResultDict`)
"""
output = {END_DATE: extract_end_dates(soup)}
for row in get_all_table_rows(soup):
@ -84,10 +131,23 @@ def extract_all_data(soup: BeautifulSoup) -> ResultDict:
@in_async_session
async def _get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> ResultDict:
async def get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> ResultDict:
"""
Returns data from the specified financial statement of the specified company.
Args:
statement:
Must be one of the strings defined in the constants `BS`, `IS`, `CF`
ticker_symbol:
The company's stock ticker symbol
quarterly (optional):
If true, the financial data of the last five quarters is scraped; otherwise (default), that of the last five years.
session (optional):
See `soup_from_url`
Returns:
Custom result dictionary (see `ResultDict`)
"""
log.info(f"Scraping {statement} for {ticker_symbol}")
url = f'{BASE_URL}/{ticker_symbol}/financials{FIN_STMT_URL_SUFFIX[statement]}'
@ -98,12 +158,34 @@ async def _get_single_company_fin_stmt(statement: str, ticker_symbol: str, quart
@in_async_session
async def _get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
async def get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the specified financial statement of the specified companies.
Args:
statement:
See `get_single_company_fin_stmt`
ticker_symbols:
Arbitrary number of companies' stock ticker symbols
quarterly (optional):
See `get_single_company_fin_stmt`
concurrent_batch_size (optional):
If multiple ticker symbols are passed, the company financials can be scraped concurrently.
This argument determines how many companies are scraped concurrently.
By default, they are scraped sequentially (i.e. a batch size of 1).
session (optional):
See `get_single_company_fin_stmt`
Returns:
If only one ticker symbol is passed, the `ResultDict` for that financial statement is returned. If multiple
symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the
corresponding `ResultDict`s.
"""
if len(ticker_symbols) == 1:
return await _get_single_company_fin_stmt(statement, ticker_symbols[0], quarterly, session)
coroutines = (_get_single_company_fin_stmt(statement, symbol, quarterly, session) for symbol in ticker_symbols)
return await get_single_company_fin_stmt(statement, ticker_symbols[0], quarterly, session)
coroutines = (get_single_company_fin_stmt(statement, symbol, quarterly, session) for symbol in ticker_symbols)
result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
return {symbol: data for symbol, data in zip(ticker_symbols, result_list)}
@ -113,11 +195,12 @@ async def get_balance_sheet(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the balance sheet of the specified company.
Returns data from the balance sheet of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await _get_multi_companies_fin_stmt(BS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
session=session)
return await get_multi_companies_fin_stmt(BS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
session=session)
@in_async_session
@ -125,11 +208,12 @@ async def get_income_statement(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the income statement of the specified company.
Returns data from the income statement of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await _get_multi_companies_fin_stmt(IS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
session=session)
return await get_multi_companies_fin_stmt(IS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
session=session)
@in_async_session
@ -137,17 +221,34 @@ async def get_cash_flow_statement(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the cash flow statement of the specified company.
Returns data from the cash flow statement of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await _get_multi_companies_fin_stmt(CF, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
session=session)
return await get_multi_companies_fin_stmt(CF, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
session=session)
@in_async_session
async def _get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> Dict[str, ResultDict]:
coroutines = (_get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in (BS, IS, CF))
async def get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> Dict[str, ResultDict]:
"""
Returns data from all financial statements of the specified company.
Concurrently calls `get_single_company_fin_stmt` three times.
Args:
ticker_symbol:
The company's stock ticker symbol
quarterly (optional):
See `get_single_company_fin_stmt`
session (optional):
See `get_single_company_fin_stmt`
Returns:
A dictionary where the keys are the three different statement names and the values are the
corresponding `ResultDict`s
"""
coroutines = (get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in (BS, IS, CF))
results = await asyncio.gather(*coroutines)
return {stmt: data for stmt, data in zip((BS, IS, CF), results)}
@ -158,10 +259,27 @@ async def get_all_financials(*ticker_symbols: str, quarterly: bool = False,
session: ClientSession = None) -> Union[Dict[str, ResultDict],
Dict[str, Dict[str, ResultDict]]]:
"""
Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified company.
Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified companies.
Args:
ticker_symbols:
Arbitrary number of companies' stock ticker symbols
quarterly (optional):
See `get_single_company_all_financials`
concurrent_batch_size (optional):
If multiple ticker symbols are passed, the company financials can be scraped concurrently.
This argument determines how many companies are scraped concurrently.
By default, they are scraped sequentially (i.e. a batch size of 1).
session (optional):
See `get_single_company_all_financials`
Returns:
If only one ticker symbol is passed, the output of `get_single_company_all_financials` is returned. If multiple
symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the
corresponding outputs of `get_single_company_all_financials`.
"""
if len(ticker_symbols) == 1:
return await _get_single_company_all_financials(ticker_symbols[0], quarterly, session)
coroutines = (_get_single_company_all_financials(symbol, quarterly, session) for symbol in ticker_symbols)
return await get_single_company_all_financials(ticker_symbols[0], quarterly, session)
coroutines = (get_single_company_all_financials(symbol, quarterly, session) for symbol in ticker_symbols)
result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
return {symbol: data for symbol, data in zip(ticker_symbols, result_list)}

View File

@ -108,7 +108,7 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
@patch.object(functions, 'extract_all_data')
@patch.object(functions, 'soup_from_url')
async def test__get_single_company_fin_stmt(self, mock_soup_from_url, mock_extract_all_data):
async def test_get_single_company_fin_stmt(self, mock_soup_from_url, mock_extract_all_data):
mock_session = MagicMock()
test_ticker, statement = 'bar', BS
test_url = f'{BASE_URL}/{test_ticker}/financials{FIN_STMT_URL_SUFFIX[statement]}'
@ -116,7 +116,7 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
mock_extract_all_data.return_value = expected_output = {'foo': 'bar'}
quarterly = False
output = await functions._get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session)
output = await functions.get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session)
self.assertDictEqual(expected_output, output)
mock_soup_from_url.assert_called_once_with(test_url, mock_session)
mock_extract_all_data.assert_called_once_with(mock_soup)
@ -124,33 +124,33 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
mock_extract_all_data.reset_mock()
quarterly = True
output = await functions._get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session)
output = await functions.get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session)
self.assertDictEqual(expected_output, output)
mock_soup_from_url.assert_called_once_with(test_url + '/quarter', mock_session)
mock_extract_all_data.assert_called_once_with(mock_soup)
@patch.object(functions, '_get_single_company_fin_stmt')
async def test__get_multi_companies_fin_stmt(self, mock__get_single_company_fin_stmt):
@patch.object(functions, 'get_single_company_fin_stmt')
async def test_get_multi_companies_fin_stmt(self, mock_get_single_company_fin_stmt):
statement, sym1, sym2, quarterly, mock_session = 'xyz', 'foo', 'bar', False, MagicMock()
mock__get_single_company_fin_stmt.return_value = expected_output = 'baz'
output = await functions._get_multi_companies_fin_stmt(statement, sym1,
quarterly=quarterly, session=mock_session)
mock_get_single_company_fin_stmt.return_value = expected_output = 'baz'
output = await functions.get_multi_companies_fin_stmt(statement, sym1,
quarterly=quarterly, session=mock_session)
self.assertEqual(expected_output, output)
mock__get_single_company_fin_stmt.assert_called_once_with(statement, sym1, quarterly, mock_session)
mock__get_single_company_fin_stmt.reset_mock()
mock_get_single_company_fin_stmt.assert_called_once_with(statement, sym1, quarterly, mock_session)
mock_get_single_company_fin_stmt.reset_mock()
expected_output = {sym1: expected_output, sym2: expected_output}
output = await functions._get_multi_companies_fin_stmt(statement, sym1, sym2,
quarterly=quarterly, session=mock_session)
output = await functions.get_multi_companies_fin_stmt(statement, sym1, sym2,
quarterly=quarterly, session=mock_session)
self.assertDictEqual(expected_output, output)
mock__get_single_company_fin_stmt.assert_has_calls([
mock_get_single_company_fin_stmt.assert_has_calls([
call(statement, sym1, quarterly, mock_session),
call(statement, sym2, quarterly, mock_session)
])
async def _helper_test_get_any_statement(self, stmt: str, mock__get_multi_companies_fin_stmt):
async def _helper_test_get_any_statement(self, stmt: str, mock_get_multi_companies_fin_stmt):
sym1, sym2, quarterly, batch_size, mock_session = 'foo', 'bar', False, 2, MagicMock()
mock__get_multi_companies_fin_stmt.return_value = expected_output = 'baz'
mock_get_multi_companies_fin_stmt.return_value = expected_output = 'baz'
if stmt == BS:
function = functions.get_balance_sheet
elif stmt == IS:
@ -161,50 +161,50 @@ class FunctionsTestCase(IsolatedAsyncioTestCase):
raise ValueError
output = await function(sym1, sym2, quarterly=quarterly, concurrent_batch_size=batch_size, session=mock_session)
self.assertEqual(expected_output, output)
mock__get_multi_companies_fin_stmt.assert_called_once_with(
mock_get_multi_companies_fin_stmt.assert_called_once_with(
stmt, sym1, sym2, quarterly=quarterly, concurrent_batch_size=batch_size, session=mock_session
)
@patch.object(functions, '_get_multi_companies_fin_stmt')
async def test_get_balance_sheet(self, mock__get_multi_companies_fin_stmt):
await self._helper_test_get_any_statement(BS, mock__get_multi_companies_fin_stmt)
@patch.object(functions, 'get_multi_companies_fin_stmt')
async def test_get_balance_sheet(self, mock_get_multi_companies_fin_stmt):
await self._helper_test_get_any_statement(BS, mock_get_multi_companies_fin_stmt)
@patch.object(functions, '_get_multi_companies_fin_stmt')
async def test_get_income_statement(self, mock__get_multi_companies_fin_stmt):
await self._helper_test_get_any_statement(IS, mock__get_multi_companies_fin_stmt)
@patch.object(functions, 'get_multi_companies_fin_stmt')
async def test_get_income_statement(self, mock_get_multi_companies_fin_stmt):
await self._helper_test_get_any_statement(IS, mock_get_multi_companies_fin_stmt)
@patch.object(functions, '_get_multi_companies_fin_stmt')
async def test_get_cash_flow_statement(self, mock__get_multi_companies_fin_stmt):
await self._helper_test_get_any_statement(CF, mock__get_multi_companies_fin_stmt)
@patch.object(functions, 'get_multi_companies_fin_stmt')
async def test_get_cash_flow_statement(self, mock_get_multi_companies_fin_stmt):
await self._helper_test_get_any_statement(CF, mock_get_multi_companies_fin_stmt)
@patch.object(functions, '_get_single_company_fin_stmt')
async def test__get_single_company_all_financials(self, mock__get_single_company_fin_stmt):
@patch.object(functions, 'get_single_company_fin_stmt')
async def test_get_single_company_all_financials(self, mock_get_single_company_fin_stmt):
symbol, quarterly, mock_session = 'foo', False, MagicMock()
mock__get_single_company_fin_stmt.return_value = bar = 'bar'
mock_get_single_company_fin_stmt.return_value = bar = 'bar'
expected_output = {BS: bar, IS: bar, CF: bar}
output = await functions._get_single_company_all_financials(symbol, quarterly, mock_session)
output = await functions.get_single_company_all_financials(symbol, quarterly, mock_session)
self.assertDictEqual(expected_output, output)
mock__get_single_company_fin_stmt.assert_has_calls([
mock_get_single_company_fin_stmt.assert_has_calls([
call(BS, symbol, quarterly, mock_session),
call(IS, symbol, quarterly, mock_session),
call(CF, symbol, quarterly, mock_session)
])
@patch.object(functions, '_get_single_company_all_financials')
async def test_get_company_financials(self, mock__get_single_company_all_financials):
mock__get_single_company_all_financials.return_value = expected_output = 'baz'
@patch.object(functions, 'get_single_company_all_financials')
async def test_get_company_financials(self, mock_get_single_company_all_financials):
mock_get_single_company_all_financials.return_value = expected_output = 'baz'
symbol, quarterly, mock_session = 'foo', False, MagicMock()
output = await functions.get_all_financials(symbol, quarterly=quarterly, session=mock_session)
self.assertEqual(expected_output, output)
mock__get_single_company_all_financials.assert_called_once_with(symbol, quarterly, mock_session)
mock__get_single_company_all_financials.reset_mock()
mock_get_single_company_all_financials.assert_called_once_with(symbol, quarterly, mock_session)
mock_get_single_company_all_financials.reset_mock()
test_sym1, test_sym2 = 'x', 'y'
expected_output = {test_sym1: expected_output, test_sym2: expected_output}
output = await functions.get_all_financials(test_sym1, test_sym2,
quarterly=quarterly, session=mock_session)
self.assertDictEqual(expected_output, output)
mock__get_single_company_all_financials.assert_has_calls([
mock_get_single_company_all_financials.assert_has_calls([
call(test_sym1, quarterly, mock_session),
call(test_sym2, quarterly, mock_session)
])