mwfin/src/mwfin/functions.py

import logging
import asyncio
from typing import Union, Tuple, Dict
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag, ResultSet
from webutils import in_async_session, gather_in_batches
from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE,
                        INDENT_MAP)

log = logging.getLogger(__name__)

# First element in each Tuple is an integer indicating the row indent
HeaderData = Tuple[int, str, str, str, str, str]
RowData = Tuple[int, float, float, float, float, float]
# The resulting dictionary's keys correspond to the name of the item (row) in the financial statement.
# The first value is a tuple of the end dates of the reporting periods as strings (see above).
# The other values are the actual data tuples containing the financial figures.
ResultDict = Dict[str, Union[HeaderData, RowData]]
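
# Purely illustrative sketch of a `ResultDict` for an annual balance sheet (hypothetical item names and figures):
# {
#     END_DATE: (0, '2017', '2018', '2019', '2020', '2021'),
#     'Cash & Short Term Investments': (0, 21.0, 25.9, 39.8, 40.8, 63.1),
#     'Cash Only': (1, 20.3, 25.3, 38.8, 39.8, 62.3),
#     ...
# }
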
@in_async_session
async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
"""
Requests a web page and turns the response text into BeautifulSoup.
Args:
url:
The GET request is sent to this URL
session (optional):
If passed an `aiohttp.ClientSession` object, it will be used to perform the request.
Otherwise a new session is created and automatically closed after the request (see `@in_async_session`).
Returns:
        The parsed HTML response text as a `BeautifulSoup` object
"""
async with session.get(url) as response:
html = await response.text()
    return BeautifulSoup(html, HTML_PARSER)


def get_row_indent(tr: Tag) -> int:
"""
Determines the visual indent of a table row.
    Some positions in a financial statement have sub-positions below them, indicated by the indentation of the text
    in the position name's cell.
Args:
tr: The table row element
Returns:
        Each indentation level corresponds to an integer: 0 = no indentation, 1 = small, 2 = medium, 3 = large.
"""
    try:
        classes = tr.div.attrs['class']
    except KeyError:
        # The row's div carries no 'class' attribute at all, i.e. no indentation
        return 0
for class_name, indent in INDENT_MAP.items():
if class_name in classes:
return indent
    return 0


def extract_end_dates(soup: BeautifulSoup) -> HeaderData:
"""
Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page of a
financial statement.
Args:
soup: The parsed page containing the financial statement
Returns:
A 6-tuple, the first element being the indent (in this case 0) and the rest being the actual end dates.
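        Example: `(0, '2017', '2018', '2019', '2020', '2021')` for annual data (hypothetical end dates).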
"""
tr = soup.find('div', attrs={'class': 'financials'}).thead.tr
ths = tr.find_all('th')
    return (0,) + tuple(str(th.string).strip() for th in ths[1:-1])


def get_all_table_rows(soup: BeautifulSoup) -> ResultSet:
"""
Returns the table rows containing the data of interest.
Args:
soup: The parsed page containing the financial statement
Returns:
All table rows containing data from the financial statement
"""
    return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')


def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
"""
Returns the name of the item displayed in the table row (of a financial statement)
as well as the position's indent and a figure for each reporting period.
Args:
tr: A table row containing data from a financial statement
Returns:
        A 2-tuple where the first element is the position's name and the second is a 6-tuple, of which the first
        element is the indent and the rest are the actual figures.
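        For example, a row named 'Cash Only' with small indentation and a `data-chart-data` attribute of
        '20.3,25.3,38.8,39.8,62.3' would yield `('Cash Only', (1, 20.3, 25.3, 38.8, 39.8, 62.3))`
        (hypothetical figures).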
"""
item_name = str(tr.td.div.string).strip()
data_div = tr.find_all('td')[-1].div.div
values_str: str = data_div.attrs['data-chart-data']
    # Missing figures appear as empty strings in the chart data; treat them as 0
    values = tuple(float(s) if s != '' else 0.0 for s in values_str.split(','))
    return item_name, (get_row_indent(tr),) + values


def extract_all_data(soup: BeautifulSoup) -> ResultDict:
"""
    Extracts financials from the page, which can contain a balance sheet, an income statement, or a cash flow
    statement.
Args:
soup: The parsed page containing a financial statement
Returns:
Custom result dictionary (see `ResultDict`)
"""
output = {END_DATE: extract_end_dates(soup)}
for row in get_all_table_rows(soup):
row_data = extract_row_data(row)
output[row_data[0]] = row_data[1]
    return output


@in_async_session
async def get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> ResultDict:
"""
Returns data from the specified financial statement of the specified company.
Args:
statement:
Must be one of the strings defined in the constants `BS`, `IS`, `CF`
ticker_symbol:
The company's stock ticker symbol
quarterly (optional):
            If true, the financial data of the last five quarters is scraped; otherwise (default) that of the last
            five years.
session (optional):
See `soup_from_url`
Returns:
Custom result dictionary (see `ResultDict`)
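        Example (hypothetical ticker symbol): `await get_single_company_fin_stmt(BS, 'AAPL', quarterly=True)`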
"""
log.info(f"Scraping {statement} for {ticker_symbol}")
url = f'{BASE_URL}/{ticker_symbol}/financials{FIN_STMT_URL_SUFFIX[statement]}'
if quarterly:
url += '/quarter'
soup = await soup_from_url(url, session)
    return extract_all_data(soup)


@in_async_session
async def get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the specified financial statement of the specified companies.
Args:
statement:
See `get_single_company_fin_stmt`
ticker_symbols:
Arbitrary number of companies' stock ticker symbols
quarterly (optional):
See `get_single_company_fin_stmt`
concurrent_batch_size (optional):
            If multiple ticker symbols are passed, the company financials can be scraped concurrently;
            this argument determines how many companies are scraped at a time.
            By default, they are scraped sequentially (i.e. a batch size of 1).
session (optional):
See `get_single_company_fin_stmt`
Returns:
If only one ticker symbol is passed, the `ResultDict` for that financial statement is returned. If multiple
symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the
corresponding `ResultDict`s.
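        Example (hypothetical ticker symbols):
        `await get_multi_companies_fin_stmt(IS, 'AAPL', 'MSFT', concurrent_batch_size=2)`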
"""
if len(ticker_symbols) == 1:
return await get_single_company_fin_stmt(statement, ticker_symbols[0], quarterly, session)
coroutines = (get_single_company_fin_stmt(statement, symbol, quarterly, session) for symbol in ticker_symbols)
result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
    return {symbol: data for symbol, data in zip(ticker_symbols, result_list)}


@in_async_session
async def get_balance_sheet(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the balance sheet of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await get_multi_companies_fin_stmt(BS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                              session=session)


@in_async_session
async def get_income_statement(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the income statement of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await get_multi_companies_fin_stmt(IS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                              session=session)


@in_async_session
async def get_cash_flow_statement(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the cash flow statement of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await get_multi_companies_fin_stmt(CF, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                              session=session)


@in_async_session
async def get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> Dict[str, ResultDict]:
"""
Returns data from all financial statements of the specified company.
Concurrently calls `get_single_company_fin_stmt` three times.
Args:
ticker_symbol:
The company's stock ticker symbol
quarterly (optional):
See `get_single_company_fin_stmt`
session (optional):
See `get_single_company_fin_stmt`
Returns:
A dictionary where the keys are the three different statement names and the values are the
corresponding `ResultDict`s
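        Example (hypothetical ticker symbol): `await get_single_company_all_financials('AAPL')` returns a
        dictionary with one key per statement (`BS`, `IS`, `CF`).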
"""
coroutines = (get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in (BS, IS, CF))
results = await asyncio.gather(*coroutines)
    return {stmt: data for stmt, data in zip((BS, IS, CF), results)}


@in_async_session
async def get_all_financials(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[Dict[str, ResultDict],
Dict[str, Dict[str, ResultDict]]]:
"""
Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified companies.
Args:
ticker_symbols:
Arbitrary number of companies' stock ticker symbols
quarterly (optional):
See `get_single_company_all_financials`
concurrent_batch_size (optional):
            If multiple ticker symbols are passed, the company financials can be scraped concurrently;
            this argument determines how many companies are scraped at a time.
            By default, they are scraped sequentially (i.e. a batch size of 1).
session (optional):
See `get_single_company_all_financials`
Returns:
If only one ticker symbol is passed, the output of `get_single_company_all_financials` is returned. If multiple
symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the
corresponding outputs of `get_single_company_all_financials`.
"""
if len(ticker_symbols) == 1:
return await get_single_company_all_financials(ticker_symbols[0], quarterly, session)
coroutines = (get_single_company_all_financials(symbol, quarterly, session) for symbol in ticker_symbols)
result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
return {symbol: data for symbol, data in zip(ticker_symbols, result_list)}
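

if __name__ == '__main__':
    # Minimal usage sketch, not part of the library API: hypothetical ticker symbols,
    # network access required. Scrapes all three statements for two companies
    # concurrently and prints the end dates of the reporting periods per statement.
    async def _demo() -> None:
        data = await get_all_financials('AAPL', 'MSFT', concurrent_batch_size=2)
        for symbol, statements in data.items():
            for stmt_name, result in statements.items():
                print(symbol, stmt_name, result[END_DATE])

    asyncio.run(_demo())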