diff --git a/src/mwfin/__main__.py b/src/mwfin/__main__.py index 9db4d13..859a189 100644 --- a/src/mwfin/__main__.py +++ b/src/mwfin/__main__.py @@ -19,6 +19,7 @@ VERBOSE = 'verbose' def parse_cli() -> dict: + """Returns the parsed command line arguments for the program as a dictionary.""" parser = ArgumentParser(description="Scrape company financials") parser.add_argument( TICKER_SYMBOL, @@ -36,7 +37,7 @@ def parse_cli() -> dict: type=int, default=DEFAULT_CONCURRENT_BATCH_SIZE, help="If multiple ticker symbols are passed, the company financials can be scraped concurrently. " - "This argument determines how many companies are scraped concurrently at any moment in time. " + "This argument determines how many companies are scraped concurrently. " "By default, they are scraped sequentially (i.e. a batch size of 1)." ) parser.add_argument( @@ -61,6 +62,10 @@ def parse_cli() -> dict: def configure_logging(verbosity: int) -> None: + """ + Sets up logging by adding a stderr-handler and setting the root logger's level according to the specified verbosity. + A verbosity of 0 (or less) sets the log level to CRITICAL. 
+ """ root_logger = logging.getLogger() root_logger.addHandler(logging.StreamHandler()) root_logger.setLevel(logging.CRITICAL) @@ -73,6 +78,7 @@ def configure_logging(verbosity: int) -> None: async def main() -> None: + """Parses CLI arguments, configures logging to stderr, performs the scraping, and prints/saves the data.""" args = parse_cli() configure_logging(args[VERBOSE]) data = await get_all_financials(*args[TICKER_SYMBOL], quarterly=args[QUARTERLY], diff --git a/src/mwfin/functions.py b/src/mwfin/functions.py index a126c4e..2308761 100644 --- a/src/mwfin/functions.py +++ b/src/mwfin/functions.py @@ -1,10 +1,10 @@ import logging import asyncio -from typing import Union, Tuple, List, Dict +from typing import Union, Tuple, Dict from aiohttp.client import ClientSession from bs4 import BeautifulSoup -from bs4.element import Tag +from bs4.element import Tag, ResultSet from webutils import in_async_session, gather_in_batches from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE, @@ -13,12 +13,12 @@ from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_UR log = logging.getLogger(__name__) -# The resulting dictionary's keys correspond to the name of the item (row) in the financial statement, -# while its values will always be tuples with a length corresponding to the number of periods (columns) -# and elements being the actual numbers, with the exception of the first key-value-pair, which will represent -# the end dates of the reporting periods as strings (either years or quarters). +# First element in each Tuple is an integer indicating the row indent HeaderData = Tuple[int, str, str, str, str, str] RowData = Tuple[int, float, float, float, float, float] +# The resulting dictionary's keys correspond to the name of the item (row) in the financial statement. +# The first value is a tuple of the end dates of the reporting periods as strings (see above). 
+# The other values are the actual data tuples containing the financial figures. ResultDict = dict[str, Union[HeaderData, RowData]] @@ -26,6 +26,16 @@ ResultDict = dict[str, Union[HeaderData, RowData]] async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup: """ Requests a web page and turns the response text into BeautifulSoup. + + Args: + url: + The GET request is sent to this URL + session (optional): + If passed an `aiohttp.ClientSession` object, it will be used to perform the request. + Otherwise a new session is created and automatically closed after the request (see `@in_async_session`). + + Returns: + The parsed html response text as BeautifulSoup """ async with session.get(url) as response: html = await response.text() @@ -33,6 +43,17 @@ async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSou def get_row_indent(tr: Tag) -> int: + """ + Determines the visual indent of a table row. + Some positions in a financial statement have sub-positions below them indicated by indentation of the text in the + position name's cell. + + Args: + tr: The table row element + + Returns: + Each indentation level corresponds to an integer. 0 = no indentation, 1 = small, 2 = medium, 3 = large + """ try: classes = tr.div.attrs['class'] except KeyError: @@ -47,15 +68,27 @@ def extract_end_dates(soup: BeautifulSoup) -> HeaderData: """ Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page of a financial statement. + + Args: + soup: The parsed page containing the financial statement + + Returns: + A 6-tuple, the first element being the indent (in this case 0) and the rest being the actual end dates. 
""" tr = soup.find('div', attrs={'class': 'financials'}).thead.tr ths = tr.find_all('th') - return (get_row_indent(tr), ) + tuple(str(th.string).strip() for th in ths[1:-1]) + return (0, ) + tuple(str(th.string).strip() for th in ths[1:-1]) -def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]: +def get_all_table_rows(soup: BeautifulSoup) -> ResultSet: """ Returns the table rows containing the data of interest. + + Args: + soup: The parsed page containing the financial statement + + Returns: + All table rows containing data from the financial statement """ return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr') @@ -63,7 +96,14 @@ def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]: def extract_row_data(tr: Tag) -> Tuple[str, RowData]: """ Returns the name of the item displayed in the table row (of a financial statement) - as well as a number for each reporting period. + as well as the position's indent and a figure for each reporting period. + + Args: + tr: A table row containing data from a financial statement + + Returns: + 2-tuple where the 1st element is the position's name and the second is a 6-tuple, of which the first element is + the indent and the rest are the actual figures. """ item_name = str(tr.td.div.string).strip() data_div = tr.find_all('td')[-1].div.div @@ -74,7 +114,14 @@ def extract_row_data(tr: Tag) -> Tuple[str, RowData]: def extract_all_data(soup: BeautifulSoup) -> ResultDict: """ - Extracts financials from the page. + Extracts financials from the page, which can contain either a balance sheet, income statement or cash flow + statement. 
+ + Args: + soup: The parsed page containing a financial statement + + Returns: + Custom result dictionary (see `ResultDict`) """ output = {END_DATE: extract_end_dates(soup)} for row in get_all_table_rows(soup): @@ -84,10 +131,23 @@ def extract_all_data(soup: BeautifulSoup) -> ResultDict: @in_async_session -async def _get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False, - session: ClientSession = None) -> ResultDict: +async def get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False, + session: ClientSession = None) -> ResultDict: """ Returns data from the specified financial statement of the specified company. + + Args: + statement: + Must be one of the strings defined in the constants `BS`, `IS`, `CF` + ticker_symbol: + The company's stock ticker symbol + quarterly (optional): + If true, the financial data of the last five quarters is scraped; otherwise (default) the last five years. + session (optional): + See `soup_from_url` + + Returns: + Custom result dictionary (see `ResultDict`) """ log.info(f"Scraping {statement} for {ticker_symbol}") url = f'{BASE_URL}/{ticker_symbol}/financials{FIN_STMT_URL_SUFFIX[statement]}' @@ -98,12 +158,34 @@ async def _get_single_company_fin_stmt(statement: str, ticker_symbol: str, quart @in_async_session -async def _get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False, - concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE, - session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]: +async def get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False, + concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE, + session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]: + """ + Returns data from the specified financial statement of the specified companies. 
+ + Args: + statement: + See `get_single_company_fin_stmt` + ticker_symbols: + Arbitrary number of companies' stock ticker symbols + quarterly (optional): + See `get_single_company_fin_stmt` + concurrent_batch_size (optional): + If multiple ticker symbols are passed, the company financials can be scraped concurrently. + This argument determines how many companies are scraped concurrently. + By default, they are scraped sequentially (i.e. a batch size of 1). + session (optional): + See `get_single_company_fin_stmt` + + Returns: + If only one ticker symbol is passed, the `ResultDict` for that financial statement is returned. If multiple + symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the + corresponding `ResultDict`s. + """ if len(ticker_symbols) == 1: - return await _get_single_company_fin_stmt(statement, ticker_symbols[0], quarterly, session) - coroutines = (_get_single_company_fin_stmt(statement, symbol, quarterly, session) for symbol in ticker_symbols) + return await get_single_company_fin_stmt(statement, ticker_symbols[0], quarterly, session) + coroutines = (get_single_company_fin_stmt(statement, symbol, quarterly, session) for symbol in ticker_symbols) result_list = await gather_in_batches(concurrent_batch_size, *coroutines) return {symbol: data for symbol, data in zip(ticker_symbols, result_list)} @@ -113,11 +195,12 @@ async def get_balance_sheet(*ticker_symbols: str, quarterly: bool = False, concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE, session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]: """ - Returns data from the balance sheet of the specified company. + Returns data from the balance sheet of the specified companies. 
+ Convenience function around `get_multi_companies_fin_stmt` """ - return await _get_multi_companies_fin_stmt(BS, *ticker_symbols, - quarterly=quarterly, concurrent_batch_size=concurrent_batch_size, - session=session) + return await get_multi_companies_fin_stmt(BS, *ticker_symbols, + quarterly=quarterly, concurrent_batch_size=concurrent_batch_size, + session=session) @in_async_session @@ -125,11 +208,12 @@ async def get_income_statement(*ticker_symbols: str, quarterly: bool = False, concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE, session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]: """ - Returns data from the income statement of the specified company. + Returns data from the income statement of the specified companies. + Convenience function around `get_multi_companies_fin_stmt` """ - return await _get_multi_companies_fin_stmt(IS, *ticker_symbols, - quarterly=quarterly, concurrent_batch_size=concurrent_batch_size, - session=session) + return await get_multi_companies_fin_stmt(IS, *ticker_symbols, + quarterly=quarterly, concurrent_batch_size=concurrent_batch_size, + session=session) @in_async_session @@ -137,17 +221,34 @@ async def get_cash_flow_statement(*ticker_symbols: str, quarterly: bool = False, concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE, session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]: """ - Returns data from the cash flow statement of the specified company. + Returns data from the cash flow statement of the specified companies. 
+ Convenience function around `get_multi_companies_fin_stmt` """ - return await _get_multi_companies_fin_stmt(CF, *ticker_symbols, - quarterly=quarterly, concurrent_batch_size=concurrent_batch_size, - session=session) + return await get_multi_companies_fin_stmt(CF, *ticker_symbols, + quarterly=quarterly, concurrent_batch_size=concurrent_batch_size, + session=session) @in_async_session -async def _get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False, - session: ClientSession = None) -> Dict[str, ResultDict]: - coroutines = (_get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in (BS, IS, CF)) +async def get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False, + session: ClientSession = None) -> Dict[str, ResultDict]: + """ + Returns data from all financial statements of the specified company. + Concurrently calls `get_single_company_fin_stmt` three times. + + Args: + ticker_symbol: + The company's stock ticker symbol + quarterly (optional): + See `get_single_company_fin_stmt` + session (optional): + See `get_single_company_fin_stmt` + + Returns: + A dictionary where the keys are the three different statement names and the values are the + corresponding `ResultDict`s + """ + coroutines = (get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in (BS, IS, CF)) results = await asyncio.gather(*coroutines) return {stmt: data for stmt, data in zip((BS, IS, CF), results)} @@ -158,10 +259,27 @@ async def get_all_financials(*ticker_symbols: str, quarterly: bool = False, session: ClientSession = None) -> Union[Dict[str, ResultDict], Dict[str, Dict[str, ResultDict]]]: """ - Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified company. + Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified companies. 
+ + Args: + ticker_symbols: + Arbitrary number of companies' stock ticker symbols + quarterly (optional): + See `get_single_company_all_financials` + concurrent_batch_size (optional): + If multiple ticker symbols are passed, the company financials can be scraped concurrently. + This argument determines how many companies are scraped concurrently. + By default, they are scraped sequentially (i.e. a batch size of 1). + session (optional): + See `get_single_company_all_financials` + + Returns: + If only one ticker symbol is passed, the output of `get_single_company_all_financials` is returned. If multiple + symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the + corresponding outputs of `get_single_company_all_financials`. """ if len(ticker_symbols) == 1: - return await _get_single_company_all_financials(ticker_symbols[0], quarterly, session) - coroutines = (_get_single_company_all_financials(symbol, quarterly, session) for symbol in ticker_symbols) + return await get_single_company_all_financials(ticker_symbols[0], quarterly, session) + coroutines = (get_single_company_all_financials(symbol, quarterly, session) for symbol in ticker_symbols) result_list = await gather_in_batches(concurrent_batch_size, *coroutines) return {symbol: data for symbol, data in zip(ticker_symbols, result_list)} diff --git a/tests/test_functions.py b/tests/test_functions.py index 0dbe293..39ae38e 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -108,7 +108,7 @@ class FunctionsTestCase(IsolatedAsyncioTestCase): @patch.object(functions, 'extract_all_data') @patch.object(functions, 'soup_from_url') - async def test__get_single_company_fin_stmt(self, mock_soup_from_url, mock_extract_all_data): + async def test_get_single_company_fin_stmt(self, mock_soup_from_url, mock_extract_all_data): mock_session = MagicMock() test_ticker, statement = 'bar', BS test_url = f'{BASE_URL}/{test_ticker}/financials{FIN_STMT_URL_SUFFIX[statement]}' @@ 
-116,7 +116,7 @@ class FunctionsTestCase(IsolatedAsyncioTestCase): mock_extract_all_data.return_value = expected_output = {'foo': 'bar'} quarterly = False - output = await functions._get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session) + output = await functions.get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session) self.assertDictEqual(expected_output, output) mock_soup_from_url.assert_called_once_with(test_url, mock_session) mock_extract_all_data.assert_called_once_with(mock_soup) @@ -124,33 +124,33 @@ class FunctionsTestCase(IsolatedAsyncioTestCase): mock_extract_all_data.reset_mock() quarterly = True - output = await functions._get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session) + output = await functions.get_single_company_fin_stmt(statement, test_ticker, quarterly, mock_session) self.assertDictEqual(expected_output, output) mock_soup_from_url.assert_called_once_with(test_url + '/quarter', mock_session) mock_extract_all_data.assert_called_once_with(mock_soup) - @patch.object(functions, '_get_single_company_fin_stmt') - async def test__get_multi_companies_fin_stmt(self, mock__get_single_company_fin_stmt): + @patch.object(functions, 'get_single_company_fin_stmt') + async def test_get_multi_companies_fin_stmt(self, mock_get_single_company_fin_stmt): statement, sym1, sym2, quarterly, mock_session = 'xyz', 'foo', 'bar', False, MagicMock() - mock__get_single_company_fin_stmt.return_value = expected_output = 'baz' - output = await functions._get_multi_companies_fin_stmt(statement, sym1, - quarterly=quarterly, session=mock_session) + mock_get_single_company_fin_stmt.return_value = expected_output = 'baz' + output = await functions.get_multi_companies_fin_stmt(statement, sym1, + quarterly=quarterly, session=mock_session) self.assertEqual(expected_output, output) - mock__get_single_company_fin_stmt.assert_called_once_with(statement, sym1, quarterly, mock_session) - 
mock__get_single_company_fin_stmt.reset_mock() + mock_get_single_company_fin_stmt.assert_called_once_with(statement, sym1, quarterly, mock_session) + mock_get_single_company_fin_stmt.reset_mock() expected_output = {sym1: expected_output, sym2: expected_output} - output = await functions._get_multi_companies_fin_stmt(statement, sym1, sym2, - quarterly=quarterly, session=mock_session) + output = await functions.get_multi_companies_fin_stmt(statement, sym1, sym2, + quarterly=quarterly, session=mock_session) self.assertDictEqual(expected_output, output) - mock__get_single_company_fin_stmt.assert_has_calls([ + mock_get_single_company_fin_stmt.assert_has_calls([ call(statement, sym1, quarterly, mock_session), call(statement, sym2, quarterly, mock_session) ]) - async def _helper_test_get_any_statement(self, stmt: str, mock__get_multi_companies_fin_stmt): + async def _helper_test_get_any_statement(self, stmt: str, mock_get_multi_companies_fin_stmt): sym1, sym2, quarterly, batch_size, mock_session = 'foo', 'bar', False, 2, MagicMock() - mock__get_multi_companies_fin_stmt.return_value = expected_output = 'baz' + mock_get_multi_companies_fin_stmt.return_value = expected_output = 'baz' if stmt == BS: function = functions.get_balance_sheet elif stmt == IS: @@ -161,50 +161,50 @@ class FunctionsTestCase(IsolatedAsyncioTestCase): raise ValueError output = await function(sym1, sym2, quarterly=quarterly, concurrent_batch_size=batch_size, session=mock_session) self.assertEqual(expected_output, output) - mock__get_multi_companies_fin_stmt.assert_called_once_with( + mock_get_multi_companies_fin_stmt.assert_called_once_with( stmt, sym1, sym2, quarterly=quarterly, concurrent_batch_size=batch_size, session=mock_session ) - @patch.object(functions, '_get_multi_companies_fin_stmt') - async def test_get_balance_sheet(self, mock__get_multi_companies_fin_stmt): - await self._helper_test_get_any_statement(BS, mock__get_multi_companies_fin_stmt) + @patch.object(functions, 
'get_multi_companies_fin_stmt') + async def test_get_balance_sheet(self, mock_get_multi_companies_fin_stmt): + await self._helper_test_get_any_statement(BS, mock_get_multi_companies_fin_stmt) - @patch.object(functions, '_get_multi_companies_fin_stmt') - async def test_get_income_statement(self, mock__get_multi_companies_fin_stmt): - await self._helper_test_get_any_statement(IS, mock__get_multi_companies_fin_stmt) + @patch.object(functions, 'get_multi_companies_fin_stmt') + async def test_get_income_statement(self, mock_get_multi_companies_fin_stmt): + await self._helper_test_get_any_statement(IS, mock_get_multi_companies_fin_stmt) - @patch.object(functions, '_get_multi_companies_fin_stmt') - async def test_get_cash_flow_statement(self, mock__get_multi_companies_fin_stmt): - await self._helper_test_get_any_statement(CF, mock__get_multi_companies_fin_stmt) + @patch.object(functions, 'get_multi_companies_fin_stmt') + async def test_get_cash_flow_statement(self, mock_get_multi_companies_fin_stmt): + await self._helper_test_get_any_statement(CF, mock_get_multi_companies_fin_stmt) - @patch.object(functions, '_get_single_company_fin_stmt') - async def test__get_single_company_all_financials(self, mock__get_single_company_fin_stmt): + @patch.object(functions, 'get_single_company_fin_stmt') + async def test_get_single_company_all_financials(self, mock_get_single_company_fin_stmt): symbol, quarterly, mock_session = 'foo', False, MagicMock() - mock__get_single_company_fin_stmt.return_value = bar = 'bar' + mock_get_single_company_fin_stmt.return_value = bar = 'bar' expected_output = {BS: bar, IS: bar, CF: bar} - output = await functions._get_single_company_all_financials(symbol, quarterly, mock_session) + output = await functions.get_single_company_all_financials(symbol, quarterly, mock_session) self.assertDictEqual(expected_output, output) - mock__get_single_company_fin_stmt.assert_has_calls([ + mock_get_single_company_fin_stmt.assert_has_calls([ call(BS, symbol, quarterly, 
mock_session), call(IS, symbol, quarterly, mock_session), call(CF, symbol, quarterly, mock_session) ]) - @patch.object(functions, '_get_single_company_all_financials') - async def test_get_company_financials(self, mock__get_single_company_all_financials): - mock__get_single_company_all_financials.return_value = expected_output = 'baz' + @patch.object(functions, 'get_single_company_all_financials') + async def test_get_company_financials(self, mock_get_single_company_all_financials): + mock_get_single_company_all_financials.return_value = expected_output = 'baz' symbol, quarterly, mock_session = 'foo', False, MagicMock() output = await functions.get_all_financials(symbol, quarterly=quarterly, session=mock_session) self.assertEqual(expected_output, output) - mock__get_single_company_all_financials.assert_called_once_with(symbol, quarterly, mock_session) - mock__get_single_company_all_financials.reset_mock() + mock_get_single_company_all_financials.assert_called_once_with(symbol, quarterly, mock_session) + mock_get_single_company_all_financials.reset_mock() test_sym1, test_sym2 = 'x', 'y' expected_output = {test_sym1: expected_output, test_sym2: expected_output} output = await functions.get_all_financials(test_sym1, test_sym2, quarterly=quarterly, session=mock_session) self.assertDictEqual(expected_output, output) - mock__get_single_company_all_financials.assert_has_calls([ + mock_get_single_company_all_financials.assert_has_calls([ call(test_sym1, quarterly, mock_session), call(test_sym2, quarterly, mock_session) ])