basic async capabilities added

Maximilian Fajnberg 2021-11-11 16:03:32 +01:00
parent 59530b5913
commit f92123bf4a
4 changed files with 24 additions and 16 deletions


@@ -4,7 +4,7 @@ Scrapes the entire list of stocks off of [MarketWatch.com](https://www.marketwat
 **NOTE**: No prices or financials of any kind are collected.
-At this time the scraper performs this task purely **sequentially**. Asynchronous HTTP requests to drastically speed up the process will be implemented soon.
+Asynchronous HTTP requests are currently used by default.
 This is not intended to be run continuously since the data gathered this way will rarely change.


@@ -1,2 +1,2 @@
 beautifulsoup4
-requests
+aiohttp
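For anyone used to requests, the swap to aiohttp in the dependency list turns the blocking call into a coroutine that has to be awaited inside an event loop. A minimal sketch of the pattern (the URL and function name are illustrative, not taken from the repository):

```python
import asyncio

from aiohttp import ClientSession


async def fetch(url: str) -> str:
    # roughly the async counterpart of requests.get(url).text
    async with ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


if __name__ == '__main__':
    # asyncio.run requires Python 3.7+, matching python_requires in the project config
    html = asyncio.run(fetch('https://example.com'))
    print(len(html))
```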


@@ -22,7 +22,7 @@ packages = find:
 python_requires = >=3.7
 install_requires =
     beautifulsoup4
-    requests
+    aiohttp

 [options.extras_require]
 tests =


@@ -1,9 +1,11 @@
 import logging
 import re
+import asyncio
 from datetime import datetime
 from string import ascii_uppercase
-from requests import get
+from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import Tag, ResultSet
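The newly imported asyncio module is what later lets the per-category scrapes run concurrently via asyncio.gather. As a standalone illustration of that mechanism (the coroutine below is made up for the example), gather schedules its awaitables at the same time and returns their results in argument order:

```python
import asyncio


async def pretend_scrape(category: str, delay: float) -> str:
    # stand-in for one per-category scrape
    await asyncio.sleep(delay)
    return category


async def demo() -> list[str]:
    # both coroutines wait concurrently, so this takes ~0.2s rather than 0.3s,
    # and results come back in argument order, not completion order
    return await asyncio.gather(pretend_scrape('A', 0.2), pretend_scrape('B', 0.1))


print(asyncio.run(demo()))  # ['A', 'B']
```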
@@ -59,41 +61,47 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()


-def all_trs_from_page(url: str) -> ResultSet:
-    response = get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+async def all_trs_from_page(url: str) -> ResultSet:
+    async with ClientSession() as session:
+        async with session.get(url) as response:
+            html = await response.text()
+    soup = BeautifulSoup(html, 'html.parser')
     try:
         return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
     except AttributeError:
         log.error("Unexpected HTML markup!")
         file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
         with open(file_name, 'w') as f:
-            f.write(response.text)
+            f.write(html)
         raise UnexpectedMarkupError


-def all_data_from_category(category: str) -> list[row_type]:
+async def all_data_from_category(category: str) -> list[row_type]:
     log.info(f"Getting companies starting with '{category}'")
     data: list[row_type] = []
     page = 1
-    trs = all_trs_from_page(f'{BASE_URL}{category}')
+    trs = await all_trs_from_page(f'{BASE_URL}{category}')
     while len(trs) > 0:
         log.info(f"Scraping page {page}")
         data.extend(data_from_rows(trs))
         page += 1
-        trs = all_trs_from_page(f'{BASE_URL}{category}/{page}')
+        trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
     return data


-def get_all_data():
-    data: list[row_type] = []
-    for category in CATEGORIES:
-        data.extend(all_data_from_category(category))
+async def get_all_data(asynchronous: bool = False) -> list[row_type]:
+    if asynchronous:
+        results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
+    else:
+        results = [await all_data_from_category(category) for category in CATEGORIES]
+    data = []
+    for result in results:
+        data.extend(result)
     return data


 def main() -> None:
-    data = get_all_data()
+    data = asyncio.run(get_all_data(True))
     for tup in data:
         print(tup)
     print(len(data), 'datasets')
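One thing to note about the committed version: all_trs_from_page opens a fresh ClientSession for every page it fetches. A common alternative, sketched below with illustrative names and not part of this commit, is to create a single session per run and pass it down, so the connection pool is reused across requests to the same host:

```python
import asyncio

from aiohttp import ClientSession


async def fetch_page(session: ClientSession, url: str) -> str:
    # reuse the caller's session instead of opening a new one per request
    async with session.get(url) as response:
        return await response.text()


async def scrape_all(urls: list[str]) -> list[str]:
    # one session keeps connections alive across all requests
    async with ClientSession() as session:
        return await asyncio.gather(*(fetch_page(session, url) for url in urls))


if __name__ == '__main__':
    pages = asyncio.run(scrape_all(['https://example.com', 'https://example.org']))
    print([len(page) for page in pages])
```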