basic async capabilities added

Maximilian Fajnberg 2021-11-11 16:03:32 +01:00
parent 59530b5913
commit f92123bf4a
4 changed files with 24 additions and 16 deletions

View File

@ -4,7 +4,7 @@ Scrapes the entire list of stocks off of [MarketWatch.com](https://www.marketwat
**NOTE**: No prices or financials of any kind are collected.
At this time the scraper performs this task purely **sequentially**. Asynchronous HTTP requests to drastically speed up the process will be implemented soon.
Asynchronous HTTP requests are currently used by default.
This is not intended to be run continuously since the data gathered this way will rarely change.
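With the change above, invoking the scraper from Python might look roughly like this (a minimal sketch; `scraper` is a placeholder module name, and `get_all_data` is the coroutine introduced in the file diff further down):

```python
import asyncio

from scraper import get_all_data  # placeholder module name, not taken from this diff

# run the per-category scrapes concurrently (asynchronous=True)
data = asyncio.run(get_all_data(True))
print(len(data), 'datasets')
```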

View File

@ -1,2 +1,2 @@
beautifulsoup4
requests
aiohttp

View File

@ -22,7 +22,7 @@ packages = find:
python_requires = >=3.7
install_requires =
    beautifulsoup4
    requests
    aiohttp
[options.extras_require]
tests =

View File

@ -1,9 +1,11 @@
import logging
import re
import asyncio
from datetime import datetime
from string import ascii_uppercase
from requests import get
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag, ResultSet
@ -59,41 +61,47 @@ def get_str_from_td(td: Tag) -> str:
    return str(content).strip()

def all_trs_from_page(url: str) -> ResultSet:
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
async def all_trs_from_page(url: str) -> ResultSet:
    async with ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
    soup = BeautifulSoup(html, 'html.parser')
    try:
        return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
    except AttributeError:
        log.error("Unexpected HTML markup!")
        file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
        with open(file_name, 'w') as f:
            f.write(response.text)
            f.write(html)
        raise UnexpectedMarkupError

def all_data_from_category(category: str) -> list[row_type]:
async def all_data_from_category(category: str) -> list[row_type]:
    log.info(f"Getting companies starting with '{category}'")
    data: list[row_type] = []
    page = 1
    trs = all_trs_from_page(f'{BASE_URL}{category}')
    trs = await all_trs_from_page(f'{BASE_URL}{category}')
    while len(trs) > 0:
        log.info(f"Scraping page {page}")
        data.extend(data_from_rows(trs))
        page += 1
        trs = all_trs_from_page(f'{BASE_URL}{category}/{page}')
        trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
    return data

def get_all_data():
    data: list[row_type] = []
    for category in CATEGORIES:
        data.extend(all_data_from_category(category))
async def get_all_data(asynchronous: bool = False) -> list[row_type]:
    if asynchronous:
        results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
    else:
        results = [await all_data_from_category(category) for category in CATEGORIES]
    data = []
    for result in results:
        data.extend(result)
    return data

def main() -> None:
    data = get_all_data()
    data = asyncio.run(get_all_data(True))
    for tup in data:
        print(tup)
    print(len(data), 'datasets')
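For reference, the added `all_trs_from_page` opens a fresh `ClientSession` for every page it fetches. The common aiohttp pattern of sharing one session across many requests looks roughly like the sketch below (illustrative names and URLs, not code from this commit):

```python
import asyncio
from aiohttp import ClientSession

async def fetch(session: ClientSession, url: str) -> str:
    # reuse the caller's session instead of opening a new one per request
    async with session.get(url) as response:
        return await response.text()

async def fetch_many(urls: list[str]) -> list[str]:
    # one session is opened once and shared by all concurrent requests
    async with ClientSession() as session:
        return await asyncio.gather(*(fetch(session, url) for url in urls))

pages = asyncio.run(fetch_many(['https://example.com/1', 'https://example.com/2']))
```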