From f92123bf4a8d32225ce694f83e5108b3306a90f4 Mon Sep 17 00:00:00 2001
From: Maximilian Fajnberg
Date: Thu, 11 Nov 2021 16:03:32 +0100
Subject: [PATCH] basic async capabilities added

---
 README.md                        |  2 +-
 requirements/common.txt          |  2 +-
 setup.cfg                        |  2 +-
 src/stock-symbol-scraper/main.py | 34 ++++++++++++++++++++------------
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 8c843a2..ca00eb3 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Scrapes the entire list of stocks off of [MarketWatch.com](https://www.marketwat
 
 **NOTE**: No prices or financials of any kind are collected.
 
-At this time the scraper performs this task purely **sequentially**. Asynchronous HTTP requests to drastically speed up the process will be implemented soon.
+Asynchronous HTTP requests are currently used by default. This is not intended to be run continuously since the data gathered this way will rarely change.
 
 
 
diff --git a/requirements/common.txt b/requirements/common.txt
index a151126..6b3d8f8 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -1,2 +1,2 @@
 beautifulsoup4
-requests
\ No newline at end of file
+aiohttp
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index c078ac3..bd0f0e7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,7 +22,7 @@ packages = find:
 python_requires = >=3.7
 install_requires =
     beautifulsoup4
-    requests
+    aiohttp
 
 [options.extras_require]
 tests =
diff --git a/src/stock-symbol-scraper/main.py b/src/stock-symbol-scraper/main.py
index cd1d363..0f90c43 100644
--- a/src/stock-symbol-scraper/main.py
+++ b/src/stock-symbol-scraper/main.py
@@ -1,9 +1,11 @@
 import logging
 import re
 
+import asyncio
+
 from datetime import datetime
 from string import ascii_uppercase
 
-from requests import get
+from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import Tag, ResultSet
@@ -59,41 +61,47 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()
 
 
-def all_trs_from_page(url: str) -> ResultSet:
-    response = get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+async def all_trs_from_page(url: str) -> ResultSet:
+    async with ClientSession() as session:
+        async with session.get(url) as response:
+            html = await response.text()
+    soup = BeautifulSoup(html, 'html.parser')
     try:
         return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
     except AttributeError:
         log.error("Unexpected HTML markup!")
         file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
         with open(file_name, 'w') as f:
-            f.write(response.text)
+            f.write(html)
         raise UnexpectedMarkupError
 
 
-def all_data_from_category(category: str) -> list[row_type]:
+async def all_data_from_category(category: str) -> list[row_type]:
     log.info(f"Getting companies starting with '{category}'")
     data: list[row_type] = []
     page = 1
-    trs = all_trs_from_page(f'{BASE_URL}{category}')
+    trs = await all_trs_from_page(f'{BASE_URL}{category}')
     while len(trs) > 0:
         log.info(f"Scraping page {page}")
         data.extend(data_from_rows(trs))
         page += 1
-        trs = all_trs_from_page(f'{BASE_URL}{category}/{page}')
+        trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
     return data
 
 
-def get_all_data():
-    data: list[row_type] = []
-    for category in CATEGORIES:
-        data.extend(all_data_from_category(category))
+async def get_all_data(asynchronous: bool = False) -> list[row_type]:
+    if asynchronous:
+        results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
+    else:
+        results = [await all_data_from_category(category) for category in CATEGORIES]
+    data = []
+    for result in results:
+        data.extend(result)
    return data
 
 
 def main() -> None:
-    data = get_all_data()
+    data = asyncio.run(get_all_data(True))
     for tup in data:
         print(tup)
     print(len(data), 'datasets')
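
A note on session reuse: in the patch above, all_trs_from_page() opens a fresh ClientSession for every page it fetches, so each request pays its own connection-setup cost. A common aiohttp pattern is to create one session up front and pass it down, letting the session's connection pool be shared by all concurrent requests. The following is a minimal sketch of that pattern only, not part of the patch: fetch_html() and fetch_all_categories() are hypothetical helpers, and BASE_URL/CATEGORIES here are placeholders standing in for the module's real constants.

import asyncio

from aiohttp import ClientSession

# Placeholder values for illustration; the real module defines its own
# BASE_URL and CATEGORIES at module level.
BASE_URL = 'https://example.com/stocks/'
CATEGORIES = ['A', 'B', 'C']


async def fetch_html(session: ClientSession, url: str) -> str:
    # Reuses the caller's session, so TCP connections are pooled across
    # requests instead of being re-established for every page.
    async with session.get(url) as response:
        return await response.text()


async def fetch_all_categories() -> list[str]:
    # One session for the whole run; asyncio.gather() fires all category
    # requests concurrently, mirroring the asynchronous branch of
    # get_all_data() in the patch above.
    async with ClientSession() as session:
        return await asyncio.gather(
            *(fetch_html(session, f'{BASE_URL}{category}') for category in CATEGORIES)
        )


if __name__ == '__main__':
    pages = asyncio.run(fetch_all_categories())
    print(len(pages), 'pages fetched')

Pagination is left out of the sketch for brevity; in the patch, all_data_from_category() keeps requesting f'{BASE_URL}{category}/{page}' until a page yields no table rows.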