diff --git a/src/stock-symbol-scraper/main.py b/src/stock-symbol-scraper/main.py index 5aa4b4e..0f987a6 100644 --- a/src/stock-symbol-scraper/main.py +++ b/src/stock-symbol-scraper/main.py @@ -36,10 +36,30 @@ class UnexpectedMarkupError(Exception): def extract_row_data(*table_rows: Tag) -> list[row_type]: + """ + Iterates over any number of table rows to extract data from them. + + Args: + table_rows: + Arbitrary number of 'tr' Tag objects to be processed for data. + + Returns: + A list of 5-tuples (of string elements) + """ return [get_single_tr_data(tr) for tr in table_rows] def get_single_tr_data(table_row: Tag) -> row_type: + """ + Returns the data from a table row. + + Args: + table_row: + Specific 'tr' Tag object to be processed for data. + + Returns: + A 5-tuple of string elements + """ tds = table_row.find_all('td') company_name = str(tds[0].a.contents[0]).strip() stock_symbol = str(tds[0].a.contents[1].contents[0]).strip() @@ -55,6 +75,16 @@ def get_single_tr_data(table_row: Tag) -> row_type: def get_str_from_td(td: Tag) -> str: + """ + Returns content of a 'td' Tag object as a string. The only content has to be a NavigableString object. + + Args: + td: + The table cell to be converted into a string. + + Returns: + String content from a cell + """ try: content = td.contents[0] except IndexError: @@ -63,6 +93,26 @@ def get_str_from_td(td: Tag) -> str: async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet: + """ + Returns the table rows found on the specified page. + + Args: + url: + URL string leading to a page with matching content. + session (optional): + If passed a ClientSession instance, all HTTP requests will be made using that session; + otherwise a new one is created. + limit (optional): + Stop looking after finding this many results; + finds all matches by default. + + Raises: + UnexpectedMarkupError: + If no table or table body is found.
+ + Returns: + A ResultSet object containing all extracted 'tr' Tag objects + """ if session is None: session = ClientSession() async with session.get(url) as response: @@ -112,6 +162,17 @@ async def get_data_from_category(category: str, session: ClientSession = None, async def get_all_data(sequential: bool = False) -> list[row_type]: + """ + Returns a list with every available data row from all categories. + + Args: + sequential (optional): + Whether or not to forgo the asynchronous gathering capabilities; + by default requests are issued concurrently. + + Returns: + A list of 5-tuples (of strings) + """ async with ClientSession() as session: if sequential: results = [await get_data_from_category(category, session) for category in CATEGORIES]