only use one http session in the main function

This commit is contained in:
Daniil Fajnberg 2021-11-11 16:21:34 +01:00
parent f92123bf4a
commit 323144200b

View File

@@ -61,8 +61,9 @@ def get_str_from_td(td: Tag) -> str:
return str(content).strip() return str(content).strip()
async def all_trs_from_page(url: str) -> ResultSet: async def all_trs_from_page(url: str, session: ClientSession = None) -> ResultSet:
async with ClientSession() as session: if session is None:
session = ClientSession()
async with session.get(url) as response: async with session.get(url) as response:
html = await response.text() html = await response.text()
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
@@ -76,24 +77,27 @@ async def all_trs_from_page(url: str) -> ResultSet:
raise UnexpectedMarkupError raise UnexpectedMarkupError
async def all_data_from_category(category: str, session: ClientSession = None) -> list[row_type]:
    """
    Collect the rows of every result page belonging to one category.

    The first page is requested at `BASE_URL + category`; each following page at
    `BASE_URL + category + '/' + page_number`, starting with page 2. Paging stops
    as soon as a page yields no table rows.

    Args:
        category: Category identifier that is appended to `BASE_URL`.
        session: Session used for every request. If omitted, a session is created
            for the duration of this call and closed again before returning.

    Returns:
        The combined output of `data_from_rows` over all scraped pages.
    """
    log.info(f"Getting companies starting with '{category}'")
    # Only a session created here is ours to close; one passed in by the caller
    # must stay open so it can keep being reused for other categories.
    owns_session = session is None
    if owns_session:
        session = ClientSession()
    try:
        data: list[row_type] = []
        page = 1
        trs = await all_trs_from_page(f'{BASE_URL}{category}', session)
        while len(trs) > 0:
            log.info(f"Scraping '{category}' page {page}")
            data.extend(data_from_rows(trs))
            page += 1
            trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}', session)
        return data
    finally:
        # Runs on success and on error alike, so a locally created session never leaks.
        if owns_session:
            await session.close()
async def get_all_data(asynchronous: bool = False) -> list[row_type]: async def get_all_data(asynchronous: bool = False) -> list[row_type]:
async with ClientSession() as session:
if asynchronous: if asynchronous:
results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES)) results = await asyncio.gather(*(all_data_from_category(category, session) for category in CATEGORIES))
else: else:
results = [await all_data_from_category(category) for category in CATEGORIES] results = [await all_data_from_category(category, session) for category in CATEGORIES]
data = [] data = []
for result in results: for result in results:
data.extend(result) data.extend(result)