basic async capabilities added
parent 59530b5913
commit f92123bf4a
@@ -4,7 +4,7 @@ Scrapes the entire list of stocks off of [MarketWatch.com](https://www.marketwat
 
 **NOTE**: No prices or financials of any kind are collected.
 
-At this time the scraper performs this task purely **sequentially**. Asynchronous HTTP requests to drastically speed up the process will be implemented soon.
+Asynchronous HTTP requests are currently used by default.
 
 This is not intended to be run continuously since the data gathered this way will rarely change.
 
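The README line above refers to the new entry point introduced later in this commit. As a rough sketch of what "used by default" means at the call site (the `scraper` module name is an assumption; `get_all_data` and its `asynchronous` flag are taken from the diff below):

```python
import asyncio

# Hypothetical driver script; importing from 'scraper' is an assumption,
# while get_all_data(asynchronous=...) matches the function changed below.
from scraper import get_all_data

data = asyncio.run(get_all_data(asynchronous=True))
print(len(data), 'datasets')
```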
@@ -1,2 +1,2 @@
 beautifulsoup4
-requests
+aiohttp
@@ -22,7 +22,7 @@ packages = find:
 python_requires = >=3.7
 install_requires =
     beautifulsoup4
-    requests
+    aiohttp
 
 [options.extras_require]
 tests =
@@ -1,9 +1,11 @@
 import logging
 import re
+
+import asyncio
 
 from datetime import datetime
 from string import ascii_uppercase
 
-from requests import get
+from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import Tag, ResultSet
 
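Swapping `from requests import get` for `from aiohttp import ClientSession` turns the fetch from a blocking call into a coroutine. A minimal, self-contained sketch of the aiohttp pattern the next hunk adopts (the URL is a placeholder):

```python
import asyncio

from aiohttp import ClientSession


async def fetch(url: str) -> str:
    # One throwaway session per call, mirroring this commit's approach;
    # the body must be awaited before the response and session close.
    async with ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


if __name__ == '__main__':
    html = asyncio.run(fetch('https://example.com'))  # placeholder URL
    print(len(html), 'characters of HTML')
```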
@@ -59,41 +61,47 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()
 
 
-def all_trs_from_page(url: str) -> ResultSet:
-    response = get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+async def all_trs_from_page(url: str) -> ResultSet:
+    async with ClientSession() as session:
+        async with session.get(url) as response:
+            html = await response.text()
+    soup = BeautifulSoup(html, 'html.parser')
     try:
         return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
     except AttributeError:
         log.error("Unexpected HTML markup!")
         file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
         with open(file_name, 'w') as f:
-            f.write(response.text)
+            f.write(html)
         raise UnexpectedMarkupError
 
 
-def all_data_from_category(category: str) -> list[row_type]:
+async def all_data_from_category(category: str) -> list[row_type]:
     log.info(f"Getting companies starting with '{category}'")
     data: list[row_type] = []
     page = 1
-    trs = all_trs_from_page(f'{BASE_URL}{category}')
+    trs = await all_trs_from_page(f'{BASE_URL}{category}')
     while len(trs) > 0:
         log.info(f"Scraping page {page}")
         data.extend(data_from_rows(trs))
         page += 1
-        trs = all_trs_from_page(f'{BASE_URL}{category}/{page}')
+        trs = await all_trs_from_page(f'{BASE_URL}{category}/{page}')
     return data
 
 
-def get_all_data():
-    data: list[row_type] = []
-    for category in CATEGORIES:
-        data.extend(all_data_from_category(category))
+async def get_all_data(asynchronous: bool = False) -> list[row_type]:
+    if asynchronous:
+        results = await asyncio.gather(*(all_data_from_category(category) for category in CATEGORIES))
+    else:
+        results = [await all_data_from_category(category) for category in CATEGORIES]
+    data = []
+    for result in results:
+        data.extend(result)
     return data
 
 
 def main() -> None:
-    data = get_all_data()
+    data = asyncio.run(get_all_data(True))
    for tup in data:
         print(tup)
     print(len(data), 'datasets')
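One observation on the design: `all_trs_from_page` opens a fresh `ClientSession` for every page, so aiohttp's connection pooling is never shared between requests, and pages within a category are still fetched one after another; only the categories run concurrently via `asyncio.gather`. A hedged sketch (not part of this commit) of threading one shared session through instead; the `session` parameter and `scrape_all` helper are hypothetical:

```python
import asyncio

from aiohttp import ClientSession


async def fetch(session: ClientSession, url: str) -> str:
    # Reuses the caller's session, so keep-alive connections are pooled
    # across requests instead of being torn down after each page.
    async with session.get(url) as response:
        return await response.text()


async def scrape_all(urls: list[str]) -> list[str]:
    # One session for the whole run; gather fetches all pages concurrently.
    async with ClientSession() as session:
        return await asyncio.gather(*(fetch(session, url) for url in urls))
```

The page loop itself cannot be gathered this way without restructuring, since the scraper only learns that a category is exhausted when a page comes back with no table rows.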