basic async capabilities added
parent 59530b5913
commit f92123bf4a
@@ -4,7 +4,7 @@ Scrapes the entire list of stocks off of [MarketWatch.com](https://www.marketwat
 
 **NOTE**: No prices or financials of any kind are collected.
 
-At this time the scraper performs this task purely **sequentially**. Asynchronous HTTP requests to drastically speed up the process will be implemented soon.
+Asynchronous HTTP requests are currently used by default.
 
 This is not intended to be run continuously since the data gathered this way will rarely change.
 
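The README line above refers to the new entry point introduced later in this commit. As a rough sketch of what "used by default" means at the call site (the `scraper` module name is an assumption; `get_all_data` and its `asynchronous` flag are taken from the diff below):

```python
import asyncio

# Hypothetical driver script; importing from 'scraper' is an assumption,
# while get_all_data(asynchronous=...) matches the function changed below.
from scraper import get_all_data

data = asyncio.run(get_all_data(asynchronous=True))
print(len(data), 'datasets')
```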
@@ -1,2 +1,2 @@
 beautifulsoup4
-requests
+aiohttp
@@ -22,7 +22,7 @@ packages = find:
 python_requires = >=3.7
 install_requires =
     beautifulsoup4
-    requests
+    aiohttp
 
 [options.extras_require]
 tests =
@@ -1,9 +1,11 @@
 import logging
 import re
+
+import asyncio
 
 from datetime import datetime
 from string import ascii_uppercase
 
-from requests import get
+from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import Tag, ResultSet
 
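Swapping `from requests import get` for `from aiohttp import ClientSession` turns the fetch from a blocking call into a coroutine. A minimal, self-contained sketch of the aiohttp pattern the next hunk adopts (the URL is a placeholder):

```python
import asyncio

from aiohttp import ClientSession


async def fetch(url: str) -> str:
    # One throwaway session per call, mirroring this commit's approach;
    # the body must be awaited before the response and session close.
    async with ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


if __name__ == '__main__':
    html = asyncio.run(fetch('https://example.com'))  # placeholder URL
    print(len(html), 'characters of HTML')
```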
@@ -59,41 +61,47 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()
 
 
-def all_trs_from_page(url: str) -> ResultSet:
-    response = get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+async def all_trs_from_page(url: str) -> ResultSet:
+    async with ClientSession() as session:
+        async with session.get(url) as response:
+            html = await response.text()
+    soup = BeautifulSoup(html, 'html.parser')
     try:
         return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
     except AttributeError:
         log.error("Unexpected HTML markup!")
         file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
         with open(file_name, 'w') as f:
-            f.write(response.text)
+            f.write(html)
         raise UnexpectedMarkupError
 
 
-def all_data_from_category(category: str) -> list[row_type]:
+async def all_data_from_category(category: str) -> list[row_type]:
     log.info(f"Getting companies starting with '{category}'")
     data: list[row_type] = []
     page = 1
-    trs = all_trs_from_page(f'{BASE_URL}{category}')
+    trs = await all_trs_from_page(f'{BASE_URL}{category}')
     while len(trs) > 0:
         log.info(f"Scraping page {page}")
         data.extend(data_from_rows(trs))
         page += 1
-        trs = all_trs_from_page(f'{BASE_URL}{category}/{page}')
+        trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
     return data
 
 
-def get_all_data():
-    data: list[row_type] = []
-    for category in CATEGORIES:
-        data.extend(all_data_from_category(category))
+async def get_all_data(asynchronous: bool = False) -> list[row_type]:
+    if asynchronous:
+        results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
+    else:
+        results = [await all_data_from_category(category) for category in CATEGORIES]
+    data = []
+    for result in results:
+        data.extend(result)
     return data
 
 
 def main() -> None:
-    data = get_all_data()
+    data = asyncio.run(get_all_data(True))
    for tup in data:
         print(tup)
     print(len(data), 'datasets')
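One observation on the design: `all_trs_from_page` opens a fresh `ClientSession` for every page, so aiohttp's connection pooling is never shared between requests, and pages within a category are still fetched one after another; only the categories run concurrently via `asyncio.gather`. A hedged sketch (not part of this commit) of threading one shared session through instead; the `session` parameter and `scrape_all` helper are hypothetical:

```python
import asyncio

from aiohttp import ClientSession


async def fetch(session: ClientSession, url: str) -> str:
    # Reuses the caller's session, so keep-alive connections are pooled
    # across requests instead of being torn down after each page.
    async with session.get(url) as response:
        return await response.text()


async def scrape_all(urls: list[str]) -> list[str]:
    # One session for the whole run; gather fetches all pages concurrently.
    async with ClientSession() as session:
        return await asyncio.gather(*(fetch(session, url) for url in urls))
```

The page loop itself cannot be gathered this way without restructuring, since the scraper only learns that a category is exhausted when a page comes back with no table rows.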