Decoupled an async request-making helper function from 'trs_from_page'

This commit is contained in:
Maximilian Fajnberg 2021-11-13 20:08:43 +01:00
parent fa19f557c9
commit 19d5d1e3eb
1 changed files with 29 additions and 14 deletions

View File

@ -89,16 +89,34 @@ def get_str_from_td(td: Tag) -> str:
return str(content).strip()
async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
    """
    Requests a page and converts its contents into a BeautifulSoup object.

    Args:
        url:
            URL string leading to any page with matching content.
        session (optional):
            If passed a ClientSession instance, the HTTP request will be made
            using that session (and the caller remains responsible for closing
            it); otherwise a temporary one is created and closed here.

    Returns:
        A BeautifulSoup object for further data extraction
    """
    if session is None:
        # Own the temporary session: close it when done to avoid aiohttp's
        # "Unclosed client session" resource leak.
        async with ClientSession() as own_session:
            async with own_session.get(url) as response:
                html = await response.text()
    else:
        async with session.get(url) as response:
            html = await response.text()
    return BeautifulSoup(html, HTML_PARSER)
def trs_from_page(soup: BeautifulSoup, limit: int = None) -> ResultSet:
    """
    Returns the table rows found on the specified page.

    Args:
        soup:
            Parsed page to be scoured for table rows.
        limit (optional):
            Stop looking after finding this many results;
            finds all matches by default.

    Returns:
        A ResultSet object containing all extracted 'tr' Tag objects

    Raises:
        UnexpectedMarkupError: If the expected 'marketsindex' table structure
            is absent from the page; the received markup is dumped to a
            timestamped HTML file for later inspection.
    """
    try:
        return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr', limit=limit)
    except AttributeError:
        # Any missing element in the find/.table/.tbody chain surfaces here.
        log.error("Unexpected HTML markup!")
        file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
        with open(file_name, 'w') as f:
            f.write(soup.prettify())
        raise UnexpectedMarkupError
@ -149,12 +162,14 @@ async def get_data_from_category(category: str, session: ClientSession = None,
session = ClientSession()
data: list[row_type] = []
page = first_page
trs = await trs_from_page(f'{BASE_URL}{category}', session)
soup = await soup_from_url(f'{BASE_URL}{category}', session)
trs = trs_from_page(soup)
while page <= last_page and len(trs) > 0:
data.extend(extract_row_data(*trs))
log.info(f"Scraped '{category}' page {page}")
page += 1
trs = await trs_from_page(f'{BASE_URL}{category}/{page}', session)
soup = await soup_from_url(f'{BASE_URL}{category}/{page}', session)
trs = trs_from_page(soup)
return data