Only use one HTTP session in the main function
This commit is contained in:
parent
f92123bf4a
commit
323144200b
@ -61,10 +61,11 @@ def get_str_from_td(td: Tag) -> str:
|
|||||||
return str(content).strip()
|
return str(content).strip()
|
||||||
|
|
||||||
|
|
||||||
async def all_trs_from_page(url: str) -> ResultSet:
|
async def all_trs_from_page(url: str, session: ClientSession = None) -> ResultSet:
|
||||||
async with ClientSession() as session:
|
if session is None:
|
||||||
async with session.get(url) as response:
|
session = ClientSession()
|
||||||
html = await response.text()
|
async with session.get(url) as response:
|
||||||
|
html = await response.text()
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
try:
|
try:
|
||||||
return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
|
return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
|
||||||
@ -76,24 +77,27 @@ async def all_trs_from_page(url: str) -> ResultSet:
|
|||||||
raise UnexpectedMarkupError
|
raise UnexpectedMarkupError
|
||||||
|
|
||||||
|
|
||||||
async def all_data_from_category(category: str) -> list[row_type]:
|
async def all_data_from_category(category: str, session: ClientSession = None) -> list[row_type]:
|
||||||
log.info(f"Getting companies starting with '{category}'")
|
log.info(f"Getting companies starting with '{category}'")
|
||||||
|
if session is None:
|
||||||
|
session = ClientSession()
|
||||||
data: list[row_type] = []
|
data: list[row_type] = []
|
||||||
page = 1
|
page = 1
|
||||||
trs = await all_trs_from_page(f'{BASE_URL}{category}')
|
trs = await all_trs_from_page(f'{BASE_URL}{category}', session)
|
||||||
while len(trs) > 0:
|
while len(trs) > 0:
|
||||||
log.info(f"Scraping page {page}")
|
log.info(f"Scraping '{category}' page {page}")
|
||||||
data.extend(data_from_rows(trs))
|
data.extend(data_from_rows(trs))
|
||||||
page += 1
|
page += 1
|
||||||
trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
|
trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}', session)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
async def get_all_data(asynchronous: bool = False) -> list[row_type]:
|
async def get_all_data(asynchronous: bool = False) -> list[row_type]:
|
||||||
if asynchronous:
|
async with ClientSession() as session:
|
||||||
results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
|
if asynchronous:
|
||||||
else:
|
results = await asyncio.gather(*(all_data_from_category(category, session) for category in CATEGORIES))
|
||||||
results = [await all_data_from_category(category) for category in CATEGORIES]
|
else:
|
||||||
|
results = [await all_data_from_category(category, session) for category in CATEGORIES]
|
||||||
data = []
|
data = []
|
||||||
for result in results:
|
for result in results:
|
||||||
data.extend(result)
|
data.extend(result)
|
||||||
|
Loading…
Reference in New Issue
Block a user