Only use one HTTP session in the main function
This commit is contained in:
parent
f92123bf4a
commit
323144200b
@ -61,8 +61,9 @@ def get_str_from_td(td: Tag) -> str:
|
||||
return str(content).strip()
|
||||
|
||||
|
||||
async def all_trs_from_page(url: str) -> ResultSet:
|
||||
async with ClientSession() as session:
|
||||
async def all_trs_from_page(url: str, session: ClientSession = None) -> ResultSet:
|
||||
if session is None:
|
||||
session = ClientSession()
|
||||
async with session.get(url) as response:
|
||||
html = await response.text()
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
@ -76,24 +77,27 @@ async def all_trs_from_page(url: str) -> ResultSet:
|
||||
raise UnexpectedMarkupError
|
||||
|
||||
|
||||
async def all_data_from_category(category: str) -> list[row_type]:
|
||||
async def all_data_from_category(category: str, session: ClientSession = None) -> list[row_type]:
|
||||
log.info(f"Getting companies starting with '{category}'")
|
||||
if session is None:
|
||||
session = ClientSession()
|
||||
data: list[row_type] = []
|
||||
page = 1
|
||||
trs = await all_trs_from_page(f'{BASE_URL}{category}')
|
||||
trs = await all_trs_from_page(f'{BASE_URL}{category}', session)
|
||||
while len(trs) > 0:
|
||||
log.info(f"Scraping page {page}")
|
||||
log.info(f"Scraping '{category}' page {page}")
|
||||
data.extend(data_from_rows(trs))
|
||||
page += 1
|
||||
trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
|
||||
trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}', session)
|
||||
return data
|
||||
|
||||
|
||||
async def get_all_data(asynchronous: bool = False) -> list[row_type]:
|
||||
async with ClientSession() as session:
|
||||
if asynchronous:
|
||||
results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
|
||||
results = await asyncio.gather(*(all_data_from_category(category, session) for category in CATEGORIES))
|
||||
else:
|
||||
results = [await all_data_from_category(category) for category in CATEGORIES]
|
||||
results = [await all_data_from_category(category, session) for category in CATEGORIES]
|
||||
data = []
|
||||
for result in results:
|
||||
data.extend(result)
|
||||
|
Loading…
Reference in New Issue
Block a user