decoupled an async request-making helper function from 'trs_from_page'
parent fa19f557c9
commit 19d5d1e3eb
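After this change, fetching and parsing are two separate steps: the new soup_from_url coroutine performs the HTTP request and returns a BeautifulSoup object, while trs_from_page becomes a plain synchronous function that extracts table rows from that soup. A minimal sketch of the resulting call pattern shown in the diff below, assuming aiohttp's ClientSession and the module's BASE_URL constant; 'gainers' is a hypothetical category path:

import asyncio

from aiohttp import ClientSession


async def main() -> None:
    # One shared session: both helpers reuse the same connection pool.
    async with ClientSession() as session:
        soup = await soup_from_url(f'{BASE_URL}gainers', session)  # 'gainers' is hypothetical
        trs = trs_from_page(soup, limit=5)  # synchronous, no await needed
        print(f'Found {len(trs)} rows')


asyncio.run(main())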
@@ -89,16 +89,34 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()
 
 
-async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
+async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
+    """
+    Requests page and converts contents into a BeautifulSoup object.
+
+    Args:
+        url:
+            URL string leading to any page with matching content.
+        session (optional):
+            If passed a ClientSession instance, all HTTP requests will be made using that session;
+            otherwise a new one is created.
+
+    Returns:
+        A BeautifulSoup object for further data extraction
+    """
+    if session is None:
+        session = ClientSession()
+    async with session.get(url) as response:
+        html = await response.text()
+    return BeautifulSoup(html, HTML_PARSER)
+
+
+def trs_from_page(soup: BeautifulSoup, limit: int = None) -> ResultSet:
     """
     Returns the table rows found on the specified page.
 
     Args:
-        url:
-            URL string leading to a page with matching content.
-        session (optional):
-            If passed a ClientSession instance, all HTTP requests will be made using that session;
-            otherwise a new one is created.
+        soup:
+            Page text to be scoured for table rows.
         limit (optional):
             Stop looking after finding this many results;
             finds all matches by default.
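One payoff of the split is that the new trs_from_page needs neither a network connection nor an event loop, so it can be exercised against static markup. A small sketch, assuming the 'marketsindex' table structure the function searches for; 'html.parser' stands in for the module's HTML_PARSER constant:

from bs4 import BeautifulSoup

SAMPLE = '''
<div id="marketsindex">
  <table><tbody>
    <tr><td>First row</td></tr>
    <tr><td>Second row</td></tr>
  </tbody></table>
</div>
'''

soup = BeautifulSoup(SAMPLE, 'html.parser')
trs = trs_from_page(soup, limit=1)  # stops after the first match
assert len(trs) == 1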
@@ -110,18 +128,13 @@ async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
 
     Returns:
         A ResultSet object containing all extracted 'tr' Tag objects
     """
-    if session is None:
-        session = ClientSession()
-    async with session.get(url) as response:
-        html = await response.text()
-    soup = BeautifulSoup(html, HTML_PARSER)
     try:
         return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr', limit=limit)
     except AttributeError:
         log.error("Unexpected HTML markup!")
         file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
         with open(file_name, 'w') as f:
-            f.write(html)
+            f.write(soup.prettify())
         raise UnexpectedMarkupError
 
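The error dump now writes soup.prettify() rather than the raw response text: after the refactor, the html string no longer exists inside trs_from_page, only the parsed soup passed in by the caller. A caller can still trap the failure and move on; a sketch, assuming UnexpectedMarkupError is importable from this module:

try:
    trs = trs_from_page(soup)
except UnexpectedMarkupError:
    # trs_from_page has already dumped the prettified markup to a
    # timestamped .html file; treat the page as empty and continue.
    trs = []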
@@ -149,12 +162,14 @@ async def get_data_from_category(category: str, session: ClientSession = None,
         session = ClientSession()
     data: list[row_type] = []
     page = first_page
-    trs = await trs_from_page(f'{BASE_URL}{category}', session)
+    soup = await soup_from_url(f'{BASE_URL}{category}', session)
+    trs = trs_from_page(soup)
     while page <= last_page and len(trs) > 0:
         data.extend(extract_row_data(*trs))
         log.info(f"Scraped '{category}' page {page}")
         page += 1
-        trs = await trs_from_page(f'{BASE_URL}{category}/{page}', session)
+        soup = await soup_from_url(f'{BASE_URL}{category}/{page}', session)
+        trs = trs_from_page(soup)
     return data
 
 
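Since both helpers accept a shared ClientSession, independent categories can also be scraped concurrently over one connection pool. A sketch, with 'gainers' and 'losers' as hypothetical category names:

import asyncio

from aiohttp import ClientSession


async def main() -> None:
    async with ClientSession() as session:
        gainers, losers = await asyncio.gather(
            get_data_from_category('gainers', session),
            get_data_from_category('losers', session),
        )
    print(len(gainers), len(losers))


asyncio.run(main())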