Decoupled an async request-making helper function ('soup_from_url') from 'trs_from_page'

2021-11-13 20:08:43 +01:00 · 2021-11-13 20:08:43 +01:00 · 19d5d1e3eb
commit 19d5d1e3eb
parent fa19f557c9
1 changed files with 29 additions and 14 deletions
--- a/src/stocksymbolscraper/scrape.py
+++ b/src/stocksymbolscraper/scrape.py
@ -89,16 +89,34 @@ def get_str_from_td(td: Tag) -> str:
    return str(content).strip()
-async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
+async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
    """
    Requests page and converts contents into a BeautifulSoup object.
    Args:
        url:
            URL string leading to any page with matching content.
        session (optional):
            If passed a ClientSession instance, all HTTP requests will be made using that session;
            otherwise a new one is created.
    Returns:
        A BeautifulSoup object for further data extraction
    """
    if session is None:
        session = ClientSession()
    async with session.get(url) as response:
        html = await response.text()
    return BeautifulSoup(html, HTML_PARSER)
 def trs_from_page(soup: BeautifulSoup, limit: int = None) -> ResultSet:
    """
    Returns the table rows found on the specified page.
    Args:
-        url:
+        soup:
-            URL string leading to a page with matching content.
+            Page text to be scoured for table rows.
        session (optional):
            If passed a ClientSession instance, all HTTP requests will be made using that session;
            otherwise a new one is created.
        limit (optional):
            Stop looking after finding this many results;
            finds all matches by default.
@ -110,18 +128,13 @@ async def trs_from_page(url: str, session: ClientSession = None, limit: int = No
    Returns:
        A ResultSet object containing all extracted 'tr' Tag objects
    """
    if session is None:
        session = ClientSession()
    async with session.get(url) as response:
        html = await response.text()
    soup = BeautifulSoup(html, HTML_PARSER)
    try:
        return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr', limit=limit)
    except AttributeError:
        log.error("Unexpected HTML markup!")
        file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
        with open(file_name, 'w') as f:
-            f.write(html)
+            f.write(soup.prettify())
        raise UnexpectedMarkupError
@ -149,12 +162,14 @@ async def get_data_from_category(category: str, session: ClientSession = None,
        session = ClientSession()
    data: list[row_type] = []
    page = first_page
-    trs = await trs_from_page(f'{BASE_URL}{category}', session)
+    soup = await soup_from_url(f'{BASE_URL}{category}', session)
    trs = trs_from_page(soup)
    while page <= last_page and len(trs) > 0:
        data.extend(extract_row_data(*trs))
        log.info(f"Scraped '{category}' page {page}")
        page += 1
-        trs = await trs_from_page(f'{BASE_URL}{category}/{page}', session)
+        soup = await soup_from_url(f'{BASE_URL}{category}/{page}', session)
        trs = trs_from_page(soup)
    return data