Decoupled an async request-making helper function from 'trs_from_page'
This commit is contained in:
parent
fa19f557c9
commit
19d5d1e3eb
@ -89,16 +89,34 @@ def get_str_from_td(td: Tag) -> str:
|
|||||||
return str(content).strip()
|
return str(content).strip()
|
||||||
|
|
||||||
|
|
||||||
async def trs_from_page(url: str, session: ClientSession = None, limit: int = None) -> ResultSet:
|
async def soup_from_url(url: str, session: ClientSession = None) -> BeautifulSoup:
|
||||||
|
"""
|
||||||
|
Requests page and converts contents into a BeautifulSoup object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url:
|
||||||
|
URL string leading to any page with matching content.
|
||||||
|
session (optional):
|
||||||
|
If passed a ClientSession instance, all HTTP requests will be made using that session;
|
||||||
|
otherwise a new one is created.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A BeautifulSoup object for further data extraction
|
||||||
|
"""
|
||||||
|
if session is None:
|
||||||
|
session = ClientSession()
|
||||||
|
async with session.get(url) as response:
|
||||||
|
html = await response.text()
|
||||||
|
return BeautifulSoup(html, HTML_PARSER)
|
||||||
|
|
||||||
|
|
||||||
|
def trs_from_page(soup: BeautifulSoup, limit: int = None) -> ResultSet:
|
||||||
"""
|
"""
|
||||||
Returns the table rows found on the specified page.
|
Returns the table rows found on the specified page.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
url:
|
soup:
|
||||||
URL string leading to a page with matching content.
|
BeautifulSoup object of the parsed page to be searched for table rows.
|
||||||
session (optional):
|
|
||||||
If passed a ClientSession instance, all HTTP requests will be made using that session;
|
|
||||||
otherwise a new one is created.
|
|
||||||
limit (optional):
|
limit (optional):
|
||||||
Stop looking after finding this many results;
|
Stop looking after finding this many results;
|
||||||
finds all matches by default.
|
finds all matches by default.
|
||||||
@ -110,18 +128,13 @@ async def trs_from_page(url: str, session: ClientSession = None, limit: int = No
|
|||||||
Returns:
|
Returns:
|
||||||
A ResultSet object containing all extracted 'tr' Tag objects
|
A ResultSet object containing all extracted 'tr' Tag objects
|
||||||
"""
|
"""
|
||||||
if session is None:
|
|
||||||
session = ClientSession()
|
|
||||||
async with session.get(url) as response:
|
|
||||||
html = await response.text()
|
|
||||||
soup = BeautifulSoup(html, HTML_PARSER)
|
|
||||||
try:
|
try:
|
||||||
return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr', limit=limit)
|
return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr', limit=limit)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
log.error("Unexpected HTML markup!")
|
log.error("Unexpected HTML markup!")
|
||||||
file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
|
file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
|
||||||
with open(file_name, 'w') as f:
|
with open(file_name, 'w') as f:
|
||||||
f.write(html)
|
f.write(soup.prettify())
|
||||||
raise UnexpectedMarkupError
|
raise UnexpectedMarkupError
|
||||||
|
|
||||||
|
|
||||||
@ -149,12 +162,14 @@ async def get_data_from_category(category: str, session: ClientSession = None,
|
|||||||
session = ClientSession()
|
session = ClientSession()
|
||||||
data: list[row_type] = []
|
data: list[row_type] = []
|
||||||
page = first_page
|
page = first_page
|
||||||
trs = await trs_from_page(f'{BASE_URL}{category}', session)
|
soup = await soup_from_url(f'{BASE_URL}{category}', session)
|
||||||
|
trs = trs_from_page(soup)
|
||||||
while page <= last_page and len(trs) > 0:
|
while page <= last_page and len(trs) > 0:
|
||||||
data.extend(extract_row_data(*trs))
|
data.extend(extract_row_data(*trs))
|
||||||
log.info(f"Scraped '{category}' page {page}")
|
log.info(f"Scraped '{category}' page {page}")
|
||||||
page += 1
|
page += 1
|
||||||
trs = await trs_from_page(f'{BASE_URL}{category}/{page}', session)
|
soup = await soup_from_url(f'{BASE_URL}{category}/{page}', session)
|
||||||
|
trs = trs_from_page(soup)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user