only use one http session in the main function

This commit is contained in:
Daniil Fajnberg 2021-11-11 16:21:34 +01:00
parent f92123bf4a
commit 323144200b

View File

@@ -61,8 +61,9 @@ def get_str_from_td(td: Tag) -> str:
return str(content).strip()
async def all_trs_from_page(url: str) -> ResultSet:
async with ClientSession() as session:
async def all_trs_from_page(url: str, session: ClientSession = None) -> ResultSet:
if session is None:
session = ClientSession()
async with session.get(url) as response:
html = await response.text()
soup = BeautifulSoup(html, 'html.parser')
@@ -76,24 +77,27 @@ async def all_trs_from_page(url: str) -> ResultSet:
raise UnexpectedMarkupError
async def all_data_from_category(category: str, session: ClientSession = None) -> list[row_type]:
    """
    Collect the row data from every page of one category.

    The first page is requested at `BASE_URL + category`; further pages at
    `BASE_URL + category + '/' + page` with `page` counting up from 2.
    Paging stops at the first page for which `all_trs_from_page` returns
    no rows.

    Args:
        category:
            Category identifier appended to `BASE_URL`
            (presumably the initial letter of the company names, judging by
            the log message -- confirm against `CATEGORIES`).
        session (optional):
            HTTP session to reuse for all requests. If omitted, a temporary
            session is opened for this call and closed before returning.

    Returns:
        The concatenated results of `data_from_rows` for all scraped pages.
    """
    if session is None:
        # A session created here is owned by this call; the context manager
        # guarantees it is closed even if scraping raises, instead of
        # leaking an open connector.
        async with ClientSession() as own_session:
            return await all_data_from_category(category, own_session)
    log.info(f"Getting companies starting with '{category}'")
    data: list[row_type] = []
    page = 1
    trs = await all_trs_from_page(f'{BASE_URL}{category}', session)
    # An empty result marks the page after the last one.
    while len(trs) > 0:
        log.info(f"Scraping '{category}' page {page}")
        data.extend(data_from_rows(trs))
        page += 1
        trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}', session)
    return data
async def get_all_data(asynchronous: bool = False) -> list[row_type]:
async with ClientSession() as session:
if asynchronous:
results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
results = await asyncio.gather(*(all_data_from_category(category, session) for category in CATEGORIES))
else:
results = [await all_data_from_category(category) for category in CATEGORIES]
results = [await all_data_from_category(category, session) for category in CATEGORIES]
data = []
for result in results:
data.extend(result)