decoupled an async request-making helper function from 'trs_from_page'
parent fa19f557c9
commit 19d5d1e3eb
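After this change, fetching and parsing are two separate steps: the new soup_from_url coroutine performs the HTTP request and returns a BeautifulSoup object, while trs_from_page becomes a plain synchronous function that extracts table rows from that soup. A minimal sketch of the resulting call pattern shown in the diff below, assuming aiohttp's ClientSession and the module's BASE_URL constant; 'gainers' is a hypothetical category path:

import asyncio

from aiohttp import ClientSession


async def main() -> None:
    # One shared session: both helpers reuse the same connection pool.
    async with ClientSession() as session:
        soup = await soup_from_url(f'{BASE_URL}gainers', session)  # 'gainers' is hypothetical
        trs = trs_from_page(soup, limit=5)  # synchronous, no await needed
        print(f'Found {len(trs)} rows')


asyncio.run(main())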
@@ -89,16 +89,34 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()
 
 
-async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
+async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
+    """
+    Requests page and converts contents into a BeautifulSoup object.
+
+    Args:
+        url:
+            URL string leading to any page with matching content.
+        session (optional):
+            If passed a ClientSession instance, all HTTP requests will be made using that session;
+            otherwise a new one is created.
+
+    Returns:
+        A BeautifulSoup object for further data extraction
+    """
+    if session is None:
+        session = ClientSession()
+    async with session.get(url) as response:
+        html = await response.text()
+    return BeautifulSoup(html, HTML_PARSER)
+
+
+def trs_from_page(soup: BeautifulSoup, limit: int = None) -> ResultSet:
     """
     Returns the table rows found on the specified page.
 
     Args:
-        url:
-            URL string leading to a page with matching content.
-        session (optional):
-            If passed a ClientSession instance, all HTTP requests will be made using that session;
-            otherwise a new one is created.
+        soup:
+            Page text to be scoured for table rows.
         limit (optional):
             Stop looking after finding this many results;
             finds all matches by default.
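One payoff of the split is that the new trs_from_page needs neither a network connection nor an event loop, so it can be exercised against static markup. A small sketch, assuming the 'marketsindex' table structure the function searches for; 'html.parser' stands in for the module's HTML_PARSER constant:

from bs4 import BeautifulSoup

SAMPLE = '''
<div id="marketsindex">
  <table><tbody>
    <tr><td>First row</td></tr>
    <tr><td>Second row</td></tr>
  </tbody></table>
</div>
'''

soup = BeautifulSoup(SAMPLE, 'html.parser')
trs = trs_from_page(soup, limit=1)  # stops after the first match
assert len(trs) == 1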
@@ -110,18 +128,13 @@ async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
 
     Returns:
         A ResultSet object containing all extracted 'tr' Tag objects
     """
-    if session is None:
-        session = ClientSession()
-    async with session.get(url) as response:
-        html = await response.text()
-    soup = BeautifulSoup(html, HTML_PARSER)
     try:
         return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr', limit=limit)
     except AttributeError:
         log.error("Unexpected HTML markup!")
         file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
         with open(file_name, 'w') as f:
-            f.write(html)
+            f.write(soup.prettify())
         raise UnexpectedMarkupError
 
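The error dump now writes soup.prettify() rather than the raw response text: after the refactor, the html string no longer exists inside trs_from_page, only the parsed soup passed in by the caller. A caller can still trap the failure and move on; a sketch, assuming UnexpectedMarkupError is importable from this module:

try:
    trs = trs_from_page(soup)
except UnexpectedMarkupError:
    # trs_from_page has already dumped the prettified markup to a
    # timestamped .html file; treat the page as empty and continue.
    trs = []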
@@ -149,12 +162,14 @@ async def get_data_from_category(category: str, session: ClientSession = None,
         session = ClientSession()
     data: list[row_type] = []
     page = first_page
-    trs = await trs_from_page(f'{BASE_URL}{category}', session)
+    soup = await soup_from_url(f'{BASE_URL}{category}', session)
+    trs = trs_from_page(soup)
     while page <= last_page and len(trs) > 0:
         data.extend(extract_row_data(*trs))
         log.info(f"Scraped '{category}' page {page}")
         page += 1
-        trs = await trs_from_page(f'{BASE_URL}{category}/{page}', session)
+        soup = await soup_from_url(f'{BASE_URL}{category}/{page}', session)
+        trs = trs_from_page(soup)
     return data
 
 
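Since both helpers accept a shared ClientSession, independent categories can also be scraped concurrently over one connection pool. A sketch, with 'gainers' and 'losers' as hypothetical category names:

import asyncio

from aiohttp import ClientSession


async def main() -> None:
    async with ClientSession() as session:
        gainers, losers = await asyncio.gather(
            get_data_from_category('gainers', session),
            get_data_from_category('losers', session),
        )
    print(len(gainers), len(losers))


asyncio.run(main())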