mwfin/src/mwfin/functions.py

import logging
import asyncio
from typing import Union, Tuple, Dict
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag, ResultSet
from webutils import in_async_session, gather_in_batches
from .constants import (HTML_PARSER, BASE_URL, END_DATE, BS, IS, CF, FIN_STMT_URL_SUFFIX, DEFAULT_CONCURRENT_BATCH_SIZE,
                        INDENT_MAP)

log = logging.getLogger(__name__)

# First element in each Tuple is an integer indicating the row indent
HeaderData = Tuple[int, str, str, str, str, str]
RowData = Tuple[int, float, float, float, float, float]
# The resulting dictionary's keys correspond to the name of the item (row) in the financial statement.
# The first value is a tuple of the end dates of the reporting periods as strings (see above).
# The other values are the actual data tuples containing the financial figures.
ResultDict = Dict[str, Union[HeaderData, RowData]]
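
# Purely illustrative sketch of a `ResultDict` for an annual balance sheet (hypothetical item names and figures):
# {
#     END_DATE: (0, '2017', '2018', '2019', '2020', '2021'),
#     'Cash & Short Term Investments': (0, 21.0, 25.9, 39.8, 40.8, 63.1),
#     'Cash Only': (1, 20.3, 25.3, 38.8, 39.8, 62.3),
#     ...
# }
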
@in_async_session
async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
"""
Requests a web page and turns the response text into BeautifulSoup.
Args:
url:
The GET request is sent to this URL
session (optional):
If passed an `aiohttp.ClientSession` object, it will be used to perform the request.
Otherwise a new session is created and automatically closed after the request (see `@in_async_session`).
Returns:
        The parsed HTML response text as a `BeautifulSoup` object
"""
async with session.get(url) as response:
html = await response.text()
    return BeautifulSoup(html, HTML_PARSER)


def get_row_indent(tr: Tag) -> int:
"""
Determines the visual indent of a table row.
    Some positions in a financial statement have sub-positions below them, indicated by the indentation of the text
    in the position name's cell.
Args:
tr: The table row element
Returns:
        Each indentation level corresponds to an integer: 0 = no indentation, 1 = small, 2 = medium, 3 = large.
"""
    try:
        classes = tr.div.attrs['class']
    except KeyError:
        # The row's div carries no 'class' attribute at all, i.e. no indentation
        return 0
for class_name, indent in INDENT_MAP.items():
if class_name in classes:
return indent
    return 0


def extract_end_dates(soup: BeautifulSoup) -> HeaderData:
"""
Finds and returns the end dates of the reporting periods as strings (either years or quarters) from the page of a
financial statement.
Args:
soup: The parsed page containing the financial statement
Returns:
A 6-tuple, the first element being the indent (in this case 0) and the rest being the actual end dates.
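        Example: `(0, '2017', '2018', '2019', '2020', '2021')` for annual data (hypothetical end dates).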
"""
tr = soup.find('div', attrs={'class': 'financials'}).thead.tr
ths = tr.find_all('th')
    return (0,) + tuple(str(th.string).strip() for th in ths[1:-1])


def get_all_table_rows(soup: BeautifulSoup) -> ResultSet:
"""
Returns the table rows containing the data of interest.
Args:
soup: The parsed page containing the financial statement
Returns:
All table rows containing data from the financial statement
"""
    return soup.find('div', attrs={'class': 'financials'}).tbody.find_all('tr')


def extract_row_data(tr: Tag) -> Tuple[str, RowData]:
"""
Returns the name of the item displayed in the table row (of a financial statement)
as well as the position's indent and a figure for each reporting period.
Args:
tr: A table row containing data from a financial statement
Returns:
        A 2-tuple where the first element is the position's name and the second is a 6-tuple, of which the first
        element is the indent and the rest are the actual figures.
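        For example, a row named 'Cash Only' with small indentation and a `data-chart-data` attribute of
        '20.3,25.3,38.8,39.8,62.3' would yield `('Cash Only', (1, 20.3, 25.3, 38.8, 39.8, 62.3))`
        (hypothetical figures).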
"""
item_name = str(tr.td.div.string).strip()
data_div = tr.find_all('td')[-1].div.div
values_str: str = data_div.attrs['data-chart-data']
    # Missing figures appear as empty strings in the chart data; treat them as 0
    values = tuple(float(s) if s != '' else 0.0 for s in values_str.split(','))
    return item_name, (get_row_indent(tr),) + values


def extract_all_data(soup: BeautifulSoup) -> ResultDict:
"""
    Extracts financials from the page, which can contain a balance sheet, an income statement, or a cash flow
    statement.
Args:
soup: The parsed page containing a financial statement
Returns:
Custom result dictionary (see `ResultDict`)
"""
output = {END_DATE: extract_end_dates(soup)}
for row in get_all_table_rows(soup):
row_data = extract_row_data(row)
output[row_data[0]] = row_data[1]
    return output


@in_async_session
async def get_single_company_fin_stmt(statement: str, ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> ResultDict:
"""
Returns data from the specified financial statement of the specified company.
Args:
statement:
Must be one of the strings defined in the constants `BS`, `IS`, `CF`
ticker_symbol:
The company's stock ticker symbol
quarterly (optional):
            If true, the financial data of the last five quarters is scraped; otherwise (default) that of the last
            five years.
session (optional):
See `soup_from_url`
Returns:
Custom result dictionary (see `ResultDict`)
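        Example (hypothetical ticker symbol): `await get_single_company_fin_stmt(BS, 'AAPL', quarterly=True)`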
"""
log.info(f"Scraping {statement} for {ticker_symbol}")
url = f'{BASE_URL}/{ticker_symbol}/financials{FIN_STMT_URL_SUFFIX[statement]}'
if quarterly:
url += '/quarter'
soup = await soup_from_url(url, session)
    return extract_all_data(soup)


@in_async_session
async def get_multi_companies_fin_stmt(statement: str, *ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the specified financial statement of the specified companies.
Args:
statement:
See `get_single_company_fin_stmt`
ticker_symbols:
Arbitrary number of companies' stock ticker symbols
quarterly (optional):
See `get_single_company_fin_stmt`
concurrent_batch_size (optional):
            If multiple ticker symbols are passed, the company financials can be scraped concurrently;
            this argument determines how many companies are scraped at a time.
            By default, they are scraped sequentially (i.e. a batch size of 1).
session (optional):
See `get_single_company_fin_stmt`
Returns:
If only one ticker symbol is passed, the `ResultDict` for that financial statement is returned. If multiple
symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the
corresponding `ResultDict`s.
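        Example (hypothetical ticker symbols):
        `await get_multi_companies_fin_stmt(IS, 'AAPL', 'MSFT', concurrent_batch_size=2)`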
"""
if len(ticker_symbols) == 1:
return await get_single_company_fin_stmt(statement, ticker_symbols[0], quarterly, session)
coroutines = (get_single_company_fin_stmt(statement, symbol, quarterly, session) for symbol in ticker_symbols)
result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
    return {symbol: data for symbol, data in zip(ticker_symbols, result_list)}


@in_async_session
async def get_balance_sheet(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the balance sheet of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await get_multi_companies_fin_stmt(BS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                              session=session)


@in_async_session
async def get_income_statement(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the income statement of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await get_multi_companies_fin_stmt(IS, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                              session=session)


@in_async_session
async def get_cash_flow_statement(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[ResultDict, Dict[str, ResultDict]]:
"""
Returns data from the cash flow statement of the specified companies.
Convenience function around `get_multi_companies_fin_stmt`
"""
return await get_multi_companies_fin_stmt(CF, *ticker_symbols,
quarterly=quarterly, concurrent_batch_size=concurrent_batch_size,
                                              session=session)


@in_async_session
async def get_single_company_all_financials(ticker_symbol: str, quarterly: bool = False,
session: ClientSession = None) -> Dict[str, ResultDict]:
"""
Returns data from all financial statements of the specified company.
Concurrently calls `get_single_company_fin_stmt` three times.
Args:
ticker_symbol:
The company's stock ticker symbol
quarterly (optional):
See `get_single_company_fin_stmt`
session (optional):
See `get_single_company_fin_stmt`
Returns:
A dictionary where the keys are the three different statement names and the values are the
corresponding `ResultDict`s
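        Example (hypothetical ticker symbol): `await get_single_company_all_financials('AAPL')` returns a
        dictionary with one key per statement (`BS`, `IS`, `CF`).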
"""
coroutines = (get_single_company_fin_stmt(stmt, ticker_symbol, quarterly, session) for stmt in (BS, IS, CF))
results = await asyncio.gather(*coroutines)
    return {stmt: data for stmt, data in zip((BS, IS, CF), results)}


@in_async_session
async def get_all_financials(*ticker_symbols: str, quarterly: bool = False,
concurrent_batch_size: int = DEFAULT_CONCURRENT_BATCH_SIZE,
session: ClientSession = None) -> Union[Dict[str, ResultDict],
Dict[str, Dict[str, ResultDict]]]:
"""
Returns all fundamentals (balance sheet, income statement and cash flow statement) of the specified companies.
Args:
ticker_symbols:
Arbitrary number of companies' stock ticker symbols
quarterly (optional):
See `get_single_company_all_financials`
concurrent_batch_size (optional):
            If multiple ticker symbols are passed, the company financials can be scraped concurrently;
            this argument determines how many companies are scraped at a time.
            By default, they are scraped sequentially (i.e. a batch size of 1).
session (optional):
See `get_single_company_all_financials`
Returns:
If only one ticker symbol is passed, the output of `get_single_company_all_financials` is returned. If multiple
symbols are passed, a dictionary is returned, where the keys are the symbols and the values are the
corresponding outputs of `get_single_company_all_financials`.
"""
if len(ticker_symbols) == 1:
return await get_single_company_all_financials(ticker_symbols[0], quarterly, session)
coroutines = (get_single_company_all_financials(symbol, quarterly, session) for symbol in ticker_symbols)
result_list = await gather_in_batches(concurrent_batch_size, *coroutines)
return {symbol: data for symbol, data in zip(ticker_symbols, result_list)}
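

if __name__ == '__main__':
    # Minimal usage sketch, not part of the library API: hypothetical ticker symbols,
    # network access required. Scrapes all three statements for two companies
    # concurrently and prints the end dates of the reporting periods per statement.
    async def _demo() -> None:
        data = await get_all_financials('AAPL', 'MSFT', concurrent_batch_size=2)
        for symbol, statements in data.items():
            for stmt_name, result in statements.items():
                print(symbol, stmt_name, result[END_DATE])

    asyncio.run(_demo())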