import asyncio
import logging
from datetime import datetime, timezone
from typing import Union, List, Dict

from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag
from webutils import in_async_session, gather_in_batches

from .constants import (DEV_MODE, HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, FIN_STMT_ITEMS,
                        DEFAULT_CONCURRENT_BATCH_SIZE)
from .exceptions import UnknownFinancialStatementItem


log = logging.getLogger(__name__)

# The resulting dictionary's keys correspond to the names of the items (rows) in the financial statement,
# while its values are always tuples whose length corresponds to the number of periods (columns)
# and whose elements are the actual numbers. The one exception is the first key-value pair, which holds
# the end dates of the reporting periods as strings (either years or quarters).
ResultDict = dict[str, Union[tuple[int, ...], tuple[str, ...]]]


@in_async_session
async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
    """
    Requests a web page and turns the response text into BeautifulSoup.

    NOTE(review): `session` defaults to None; presumably the `in_async_session` decorator supplies a session
    when none is passed -- confirm against `webutils`.
    """
    async with session.get(url) as response:
        html = await response.text()
    return BeautifulSoup(html, HTML_PARSER)


def extract_end_dates(soup: BeautifulSoup) -> tuple[str, ...]:
    """
    Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page
    of a financial statement.

    The header cells are taken from the `<thead>` of the `div.financials` element; the first and the last
    header cell are skipped.
    """
    ths = soup.find('div', attrs={'class': 'financials'}).thead.find_all('th')
    return tuple(str(th.string).strip() for th in ths[1:-1])


def is_relevant_table_row(tr: Tag) -> bool:
    """
    Returns the relevance flag stored in `FIN_STMT_ITEMS` for the item in the table row.

    Logs a warning and raises `UnknownFinancialStatementItem` if the item name is not a key of `FIN_STMT_ITEMS`.
    """
    item_name = str(tr.td.div.string).strip()
    try:
        return FIN_STMT_ITEMS[item_name]
    except KeyError:
        log.warning("Unknown item name '%s' found in financial statement.", item_name)
        raise UnknownFinancialStatementItem


def find_relevant_table_rows(soup: BeautifulSoup) -> List[Tag]:
    """
    Returns the table rows containing the data of interest.

    Rows with unknown item names are skipped. If `DEV_MODE` is set and at least one unknown item was encountered,
    the entire page is dumped once into a timestamped HTML file in the current working directory.
    """
    now = datetime.now(timezone.utc)
    trs = []
    unknown_item_found = False
    for tr in soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr'):
        try:
            if is_relevant_table_row(tr):
                trs.append(tr)
        except UnknownFinancialStatementItem:
            unknown_item_found = True
    if unknown_item_found and DEV_MODE:
        # The dump is identical for every unknown item on the page, so it is written only once;
        # an explicit encoding avoids failures on platforms whose default encoding cannot represent the page.
        with open(f'mwfin_unknown_items_{now.strftime("%Y-%m-%d_%H-%M-%S")}.html', 'w', encoding='utf-8') as f:
            f.write(str(soup))
    return trs


def extract_row_data(tr: Tag) -> tuple[str, tuple[int, ...]]:
    """
    Returns the name of the item displayed in the table row (of a financial statement)
    as well as a number for each reporting period.

    The numbers are read from the comma-separated `data-chart-data` attribute found in the last cell of the row.
    Empty entries are treated as 0; all other entries are parsed as floats and truncated to integers.
    """
    item_name = str(tr.td.div.string).strip()
    data_div = tr.find_all('td')[-1].div.div
    values_str: str = data_div.attrs['data-chart-data']
    values = tuple(int(float(s)) if s != '' else 0 for s in values_str.split(','))
    return item_name, values


def extract_all_data(soup: BeautifulSoup) -> ResultDict:
    """
    Extracts financials from the page.

    The returned dictionary starts with the `END_DATE` key (reporting period end dates), followed by one entry
    per relevant table row.
    """
    output = {END_DATE: extract_end_dates(soup)}
    for row in find_relevant_table_rows(soup):
        item_name, values = extract_row_data(row)
        output[item_name] = values
    return output


@in_async_session
async def _get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False,
                                       session: ClientSession = None) -> ResultDict:
    """
    Returns data from the specified financial statement of the specified company.

    `statement` must be a key of `FIN_STMT_URL_SUFFIX`; if `quarterly` is True, '/quarter' is appended to the URL.
    """
    log.info("Scraping %s for %s", statement, ticker_symbol)
    url = f'{BASE_URL}/{ticker_symbol}/financials{FIN_STMT_URL_SUFFIX[statement]}'
    if quarterly:
        url += '/quarter'
    soup = await soup_from_url(url, session)
    return extract_all_data(soup)


@in_async_session
async def _get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False,
                                        concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                                        session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns data from the specified financial statement of one or more companies.

    With exactly one ticker symbol, the statement data of that company is returned directly.
    Otherwise a dictionary mapping each ticker symbol to its statement data is returned.
    """
    if len(ticker_symbols) == 1:
        return await _get_single_company_fin_stmt(statement, ticker_symbols[0], quarterly, session)
    coroutines = (_get_single_company_fin_stmt(statement, symbol, quarterly, session) for symbol in ticker_symbols)
    # NOTE(review): assumes `gather_in_batches` returns results in the order of its inputs -- confirm.
    result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
    return dict(zip(ticker_symbols, result_list))


@in_async_session
async def get_balance_sheet(*ticker_symbols: str, quarterly: bool = False,
                            concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                            session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns data from the balance sheet of the specified company.

    If more than one ticker symbol is passed, a dictionary mapping each symbol to its data is returned instead.
    """
    return await _get_multi_companies_fin_stmt(BS, *ticker_symbols,
                                               quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                               session=session)


@in_async_session
async def get_income_statement(*ticker_symbols: str, quarterly: bool = False,
                               concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                               session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns data from the income statement of the specified company.

    If more than one ticker symbol is passed, a dictionary mapping each symbol to its data is returned instead.
    """
    return await _get_multi_companies_fin_stmt(IS, *ticker_symbols,
                                               quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                               session=session)


@in_async_session
async def get_cash_flow_statement(*ticker_symbols: str, quarterly: bool = False,
                                  concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                                  session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
    """
    Returns data from the cash flow statement of the specified company.

    If more than one ticker symbol is passed, a dictionary mapping each symbol to its data is returned instead.
    """
    return await _get_multi_companies_fin_stmt(CF, *ticker_symbols,
                                               quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                               session=session)


@in_async_session
async def _get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False,
                                             session: ClientSession = None) -> Dict[str, ResultDict]:
    """
    Returns all three financial statements of the specified company.

    The statements are requested concurrently; the result maps `BS`, `IS` and `CF` to the respective data.
    """
    statements = (BS, IS, CF)
    coroutines = (_get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in statements)
    results = await asyncio.gather(*coroutines)
    return dict(zip(statements, results))


@in_async_session
async def get_all_financials(*ticker_symbols: str, quarterly: bool = False,
                             concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
                             session: ClientSession = None) -> Union[Dict[str, ResultDict],
                                                                     Dict[str, Dict[str, ResultDict]]]:
    """
    Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified company.

    With exactly one ticker symbol, a dictionary mapping each statement to its data is returned.
    Otherwise a dictionary mapping each ticker symbol to such a dictionary is returned.
    """
    if len(ticker_symbols) == 1:
        return await _get_single_company_all_financials(ticker_symbols[0], quarterly, session)
    coroutines = (_get_single_company_all_financials(symbol, quarterly, session) for symbol in ticker_symbols)
    # NOTE(review): assumes `gather_in_batches` returns results in the order of its inputs -- confirm.
    result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
    return dict(zip(ticker_symbols, result_list))