mwfin/src/mwfin/functions.py

153 lines
6.9 KiB
Python

import logging
import asyncio
from typing import Union, List, Dict
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
from webutils import in_async_session, gather_in_batches
from .constants import HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE
log = logging.getLogger(__name__)

# Shape of one scraped financial statement.
# The keys are the names of the items (rows) in the financial statement, while the values are tuples with
# one element per reporting period (column) holding the actual numbers. The exception is the first
# key-value pair, which holds the end dates of the reporting periods as strings (either years or quarters).
# The `...` marks the tuples as variable-length; a bare `tuple[float]` would denote a tuple of exactly one float.
ResultDict = dict[str, Union[tuple[float, ...], tuple[str, ...]]]
@in_async_session
async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
    """
    Fetches the page at `url` with a GET request and parses the response text into a `BeautifulSoup` object,
    using the parser named by `HTML_PARSER`.

    NOTE(review): `session` defaults to `None`; the `in_async_session` decorator presumably supplies a session
    in that case -- confirm in `webutils`.
    """
    async with session.get(url) as resp:
        page_text = await resp.text()
    return BeautifulSoup(page_text, HTML_PARSER)
def extract_end_dates(soup: BeautifulSoup) -> tuple[str, ...]:
    """
    Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page
    of a financial statement.

    The dates are read from the header cells of the table inside the `div` with class `financials`. The first and
    the last header cell are skipped (presumably the item-name column and the trend-chart column -- confirm against
    the page markup). The result holds one whitespace-stripped string per remaining header cell, so its length
    varies with the number of reporting periods on the page.
    """
    financials_div = soup.find('div', attrs={'class': 'financials'})
    header_cells = financials_div.thead.find_all('th')
    period_cells = header_cells[1:-1]
    return tuple(str(cell.string).strip() for cell in period_cells)
def get_all_table_rows(soup: BeautifulSoup) -> List[Tag]:
    """
    Collects every `tr` element from the body of the table inside the `div` with class `financials`.
    These are the rows containing the data of interest.
    """
    financials_div = soup.find('div', attrs={'class': 'financials'})
    table_body = financials_div.tbody
    return table_body.find_all('tr')
def extract_row_data(tr: Tag) -> tuple[str, tuple[float, ...]]:
    """
    Returns the name of the item displayed in the table row (of a financial statement)
    as well as a number for each reporting period.

    The name is the whitespace-stripped text of the first cell. The numbers are read from the comma-separated
    `data-chart-data` attribute found in the last cell of the row. Cells in that list that are empty or contain
    only whitespace are reported as `0.0`.

    Raises `ValueError` if a non-blank cell cannot be converted to a float.
    """
    item_name = str(tr.td.div.string).strip()
    # The last cell nests the element carrying all period values in a single attribute.
    chart_div = tr.find_all('td')[-1].div.div
    raw_values: str = chart_div.attrs['data-chart-data']
    # Blank cells mean "no value"; `float` itself tolerates surrounding whitespace on real numbers.
    values = tuple(float(cell) if cell.strip() else 0.0 for cell in raw_values.split(','))
    return item_name, values
def extract_all_data(soup: BeautifulSoup) -> ResultDict:
    """
    Builds the complete result dictionary for one financial statement page.

    The first entry maps `END_DATE` to the end dates of the reporting periods; every following entry maps
    the name of a table row to that row's numbers, in the order the rows appear on the page.
    """
    statement_data = {END_DATE: extract_end_dates(soup)}
    # Each row yields a (name, values) pair, which is exactly what `dict.update` consumes.
    statement_data.update(extract_row_data(table_row) for table_row in get_all_table_rows(soup))
    return statement_data
@in_async_session
async def _get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False,
                                       session: ClientSession = None) -> ResultDict:
    """
    Scrapes one financial statement of one company and returns its data.

    The page address is built from `BASE_URL`, the ticker symbol and the URL suffix registered for `statement`
    in `FIN_STMT_URL_SUFFIX`; `/quarter` is appended when `quarterly` is set. A `statement` that is not a key
    of `FIN_STMT_URL_SUFFIX` raises `KeyError`.
    """
    log.info(f"Scraping {statement} for {ticker_symbol}")
    period_suffix = '/quarter' if quarterly else ''
    page_url = f'{BASE_URL}/{ticker_symbol}/financials{FIN_STMT_URL_SUFFIX[statement]}' + period_suffix
    page = await soup_from_url(page_url, session)
    return extract_all_data(page)
@in_async_session
async def _get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False,
                                        concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                                        session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns data from the specified financial statement for any number of companies.

    With exactly one ticker symbol the company's result dictionary is returned directly. Otherwise the return
    value is a dictionary mapping each ticker symbol to its result dictionary.

    NOTE(review): the pairing of symbols with results assumes `gather_in_batches` returns results in the order
    the coroutines were passed, and `concurrent_batch_size` presumably caps how many run at once -- confirm
    both in `webutils`.
    """
    if len(ticker_symbols) == 1:
        (only_symbol,) = ticker_symbols
        return await _get_single_company_fin_stmt(statement, only_symbol, quarterly, session)
    pending = [_get_single_company_fin_stmt(statement, sym, quarterly, session) for sym in ticker_symbols]
    per_company = await gather_in_batches(concurrent_batch_size, *pending)
    return dict(zip(ticker_symbols, per_company))
@in_async_session
async def get_balance_sheet(*ticker_symbols: str, quarterly: bool = False,
                            concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                            session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns balance sheet data for the specified companies.

    Delegates to `_get_multi_companies_fin_stmt` with `BS` as the statement and passes all other arguments
    through unchanged, so a single ticker symbol yields one result dictionary and several yield a dictionary
    keyed by ticker symbol.
    """
    forwarded = {'quarterly': quarterly, 'concurrent_batch_size': concurrent_batch_size, 'session': session}
    return await _get_multi_companies_fin_stmt(BS, *ticker_symbols, **forwarded)
@in_async_session
async def get_income_statement(*ticker_symbols: str, quarterly: bool = False,
                               concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                               session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns income statement data for the specified companies.

    Delegates to `_get_multi_companies_fin_stmt` with `IS` as the statement and passes all other arguments
    through unchanged, so a single ticker symbol yields one result dictionary and several yield a dictionary
    keyed by ticker symbol.
    """
    forwarded = {'quarterly': quarterly, 'concurrent_batch_size': concurrent_batch_size, 'session': session}
    return await _get_multi_companies_fin_stmt(IS, *ticker_symbols, **forwarded)
@in_async_session
async def get_cash_flow_statement(*ticker_symbols: str, quarterly: bool = False,
                                  concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                                  session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns cash flow statement data for the specified companies.

    Delegates to `_get_multi_companies_fin_stmt` with `CF` as the statement and passes all other arguments
    through unchanged, so a single ticker symbol yields one result dictionary and several yield a dictionary
    keyed by ticker symbol.
    """
    forwarded = {'quarterly': quarterly, 'concurrent_batch_size': concurrent_batch_size, 'session': session}
    return await _get_multi_companies_fin_stmt(CF, *ticker_symbols, **forwarded)
@in_async_session
async def _get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False,
                                             session: ClientSession = None) -> Dict[str, ResultDict]:
    """
    Scrapes the balance sheet, income statement and cash flow statement of one company concurrently.

    Returns a dictionary with the keys `BS`, `IS` and `CF` (in that order), each mapped to the result
    dictionary of the corresponding statement.
    """
    statements = (BS, IS, CF)
    pending = [_get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in statements]
    # `asyncio.gather` returns results in the order of the awaitables passed in, so they line up with `statements`.
    scraped = await asyncio.gather(*pending)
    return dict(zip(statements, scraped))
@in_async_session
async def get_all_financials(*ticker_symbols: str, quarterly: bool = False,
                             concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                             session: ClientSession = None) -> Union[Dict[str, ResultDict],
                                                                     Dict[str, Dict[str, ResultDict]]]:
    """
    Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified companies.

    With exactly one ticker symbol the return value is that company's dictionary of statements. Otherwise it is
    a dictionary mapping each ticker symbol to its dictionary of statements.

    NOTE(review): the pairing of symbols with results assumes `gather_in_batches` returns results in the order
    the coroutines were passed -- confirm in `webutils`.
    """
    if len(ticker_symbols) == 1:
        (only_symbol,) = ticker_symbols
        return await _get_single_company_all_financials(only_symbol, quarterly, session)
    pending = [_get_single_company_all_financials(sym, quarterly, session) for sym in ticker_symbols]
    per_company = await gather_in_batches(concurrent_batch_size, *pending)
    return dict(zip(ticker_symbols, per_company))