Summary of this patch to src/stocksymbolscraper/scrape.py:
  * Adds the coroutine soup_from_url(url, session), which requests the page and
    returns its contents as a BeautifulSoup object.
  * trs_from_page becomes a plain (non-async) function that takes an already
    parsed BeautifulSoup object instead of a URL and a session.
  * On unexpected markup, the dump file now receives soup.prettify() instead of
    the raw response text.
  * get_data_from_category calls soup_from_url and then trs_from_page.

NOTE(review): in the hunks shown, the ClientSession created when `session` is
None is never closed -- confirm whether it is closed elsewhere.
NOTE(review): the new `soup:` docstring entry says "Page text", while the
parameter is annotated as BeautifulSoup.
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; in particular the nesting of the
`BeautifulSoup(html, HTML_PARSER)` lines under `async with` is assumed --
verify against the repository before applying.

diff --git a/src/stocksymbolscraper/scrape.py b/src/stocksymbolscraper/scrape.py
index b40f5cf..7c88041 100644
--- a/src/stocksymbolscraper/scrape.py
+++ b/src/stocksymbolscraper/scrape.py
@@ -89,16 +89,34 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()
 
 
-async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
+async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
+    """
+    Requests page and converts contents into a BeautifulSoup object.
+
+    Args:
+        url:
+            URL string leading to any page with matching content.
+        session (optional):
+            If passed a ClientSession instance, all HTTP requests will be made using that session;
+            otherwise a new one is created.
+
+    Returns:
+        A BeautifulSoup object for further data extraction
+    """
+    if session is None:
+        session = ClientSession()
+    async with session.get(url) as response:
+        html = await response.text()
+        return BeautifulSoup(html, HTML_PARSER)
+
+
+def trs_from_page(soup: BeautifulSoup, limit: int = None) -> ResultSet:
     """
     Returns the table rows found on the specified page.
 
     Args:
-        url:
-            URL string leading to a page with matching content.
-        session (optional):
-            If passed a ClientSession instance, all HTTP requests will be made using that session;
-            otherwise a new one is created.
+        soup:
+            Page text to be scoured for table rows.
         limit (optional):
             Stop looking after finding this many results; finds all matches by default.
 
@@ -110,18 +128,13 @@ async def trs_from_page(url: str, session: ClientSession = None, limit: int = No
     Returns:
         A ResultSet object containing all extracted 'tr' Tag objects
     """
-    if session is None:
-        session = ClientSession()
-    async with session.get(url) as response:
-        html = await response.text()
-        soup = BeautifulSoup(html, HTML_PARSER)
     try:
         return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr', limit=limit)
     except AttributeError:
         log.error("Unexpected HTML markup!")
         file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
         with open(file_name, 'w') as f:
-            f.write(html)
+            f.write(soup.prettify())
         raise UnexpectedMarkupError
 
 
@@ -149,12 +162,14 @@ async def get_data_from_category(category: str, session: ClientSession = None,
         session = ClientSession()
     data: list[row_type] = []
     page = first_page
-    trs = await trs_from_page(f'{BASE_URL}{category}', session)
+    soup = await soup_from_url(f'{BASE_URL}{category}', session)
+    trs = trs_from_page(soup)
     while page <= last_page and len(trs) > 0:
         data.extend(extract_row_data(*trs))
         log.info(f"Scraped '{category}' page {page}")
         page += 1
-        trs = await trs_from_page(f'{BASE_URL}{category}/{page}', session)
+        soup = await soup_from_url(f'{BASE_URL}{category}/{page}', session)
+        trs = trs_from_page(soup)
     return data
 
 