basic async capabilities added

This commit is contained in:
2021-11-11 16:03:32 +01:00
parent 59530b5913
commit f92123bf4a
4 changed files with 24 additions and 16 deletions

View File

@@ -1,9 +1,11 @@
import logging
import re
import asyncio
from datetime import datetime
from string import ascii_uppercase
from requests import get
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import Tag, ResultSet
@@ -59,41 +61,47 @@ def get_str_from_td(td: Tag) -> str:
return str(content).strip()
async def all_trs_from_page(url: str) -> ResultSet:
    """Fetch *url* and return every ``<tr>`` row of the markets-index table.

    A short-lived :class:`aiohttp.ClientSession` is opened per call.
    # NOTE(review): a session shared across calls would avoid repeated
    # connection setup — confirm call volume before changing the interface.

    Raises:
        UnexpectedMarkupError: when the expected ``div#marketsindex`` table
            is missing; the offending HTML is dumped to a timestamped file
            so the markup change can be inspected later.
    """
    async with ClientSession() as session:
        async with session.get(url) as response:
            # Read the full body while the connection is still open; after
            # the context managers exit only the captured text is needed.
            html = await response.text()
    soup = BeautifulSoup(html, 'html.parser')
    try:
        return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
    except AttributeError:
        # One of the chained attribute lookups returned None, i.e. the
        # upstream page no longer matches the expected structure.
        log.error("Unexpected HTML markup!")
        file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
        with open(file_name, 'w') as f:
            f.write(html)
        raise UnexpectedMarkupError
async def all_data_from_category(category: str) -> list[row_type]:
    """Scrape every results page for *category* and return the collected rows.

    Pages are fetched sequentially starting at 1; scraping stops at the
    first page that yields no table rows.
    """
    log.info(f"Getting companies starting with '{category}'")
    data: list[row_type] = []
    page = 1
    trs = await all_trs_from_page(f'{BASE_URL}{category}')
    while len(trs) > 0:
        log.info(f"Scraping page {page}")
        data.extend(data_from_rows(trs))
        page += 1
        # Page 1 has no suffix; subsequent pages are addressed as
        # {BASE_URL}{category}/{page}.
        trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
    return data
async def get_all_data(asynchronous: bool = False) -> list[row_type]:
    """Collect the rows of every category into one flat list.

    Args:
        asynchronous: when True, the categories are scraped concurrently
            via ``asyncio.gather``; otherwise they are awaited one by one.

    Returns:
        All rows from all categories, in ``CATEGORIES`` order.
    """
    if asynchronous:
        results = await asyncio.gather(
            *(all_data_from_category(category) for category in CATEGORIES)
        )
    else:
        results = [await all_data_from_category(category) for category in CATEGORIES]
    # Flatten the per-category result lists into a single list of rows.
    data: list[row_type] = []
    for result in results:
        data.extend(result)
    return data
def main() -> None:
    """Entry point: scrape all categories concurrently and print each row."""
    # True -> concurrent scraping across categories (asyncio.gather path).
    data = asyncio.run(get_all_data(True))
    for tup in data:
        print(tup)
    print(len(data), 'datasets')