first working draft for scraping all stock symbols (and supplementary data)
parent 7588dc1600
commit 1dd2975a3c
@@ -1,12 +1,33 @@
+import logging
 import re
+
+from datetime import datetime
+from string import ascii_uppercase
 
 from requests import get
 from bs4 import BeautifulSoup
 from bs4.element import Tag, ResultSet
 
 
+log = logging.getLogger(__name__)
+log.setLevel(logging.DEBUG)
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+log.addHandler(ch)
+
 row_type = tuple[str, str, str, str, str]
 
+DOMAIN = 'www.marketwatch.com'
+BASE_URL = f'https://{DOMAIN}/tools/markets/stocks/a-z/'
+DIGIT_CATEGORY = '0-9'
+OTHER_CATEGORY = 'Other'
+CATEGORIES = [DIGIT_CATEGORY] + list(ascii_uppercase) + [OTHER_CATEGORY]
+STOCK_SYMBOL_PATTERN = re.compile(r'\(([\w.&]+)\)')
+
+
+class UnexpectedMarkupError(Exception):
+    pass
+
 
 def data_from_rows(trs: ResultSet) -> list[row_type]:
     data: list[row_type] = []
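As an aside (not part of the commit), the new CATEGORIES constant simply enumerates every listing index the scraper will walk; a quick sketch confirms its shape:

    from string import ascii_uppercase

    DIGIT_CATEGORY = '0-9'
    OTHER_CATEGORY = 'Other'
    CATEGORIES = [DIGIT_CATEGORY] + list(ascii_uppercase) + [OTHER_CATEGORY]

    # 28 entries: '0-9', 'A' through 'Z', then 'Other'
    print(len(CATEGORIES))                 # 28
    print(CATEGORIES[0], CATEGORIES[-1])   # 0-9 Other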
@@ -19,8 +40,11 @@ def get_single_row_data(table_row: Tag) -> row_type:
     tds = table_row.find_all('td')
     company_name = str(tds[0].a.contents[0]).strip()
     stock_symbol = str(tds[0].a.contents[1].contents[0]).strip()
-    stock_symbol_pattern = re.compile(r'\(([\w.]+)\)')
-    stock_symbol = re.search(stock_symbol_pattern, stock_symbol).group(1)
+    m = re.search(STOCK_SYMBOL_PATTERN, stock_symbol)
+    if m is None:
+        log.error(f"{stock_symbol} did not match the stock symbol pattern")
+    else:
+        stock_symbol = m.group(1)
     country = get_str_from_td(tds[1])
     exchange = get_str_from_td(tds[2])
     sector = get_str_from_td(tds[3])
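A minimal sketch of the new guard around the symbol regex, using made-up inputs; note that the module-level STOCK_SYMBOL_PATTERN also admits '&', which the old inline pattern did not:

    import re

    STOCK_SYMBOL_PATTERN = re.compile(r'\(([\w.&]+)\)')

    for raw in ['(BRK.B)', '(AB&C)', 'no parentheses here']:   # hypothetical inputs
        m = re.search(STOCK_SYMBOL_PATTERN, raw)
        if m is None:
            # the third input ends up here instead of raising AttributeError
            print(f"{raw!r} did not match the stock symbol pattern")
        else:
            print(m.group(1))   # BRK.B, then AB&C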
@@ -35,20 +59,44 @@ def get_str_from_td(td: Tag) -> str:
     return str(content).strip()
 
 
-def all_trs_from_soup(soup: BeautifulSoup) -> ResultSet:
-    return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
+def all_trs_from_page(url: str) -> ResultSet:
+    response = get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    try:
+        return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
+    except AttributeError:
+        log.error("Unexpected HTML markup!")
+        file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html'
+        with open(file_name, 'w') as f:
+            f.write(response.text)
+        raise UnexpectedMarkupError
 
 
-def soup_from_page(url: str) -> BeautifulSoup:
-    return BeautifulSoup(get(url).text, 'html.parser')
+def all_data_from_category(category: str) -> list[row_type]:
+    log.info(f"Getting companies starting with '{category}'")
+    data: list[row_type] = []
+    page = 1
+    trs = all_trs_from_page(f'{BASE_URL}{category}')
+    while len(trs) > 0:
+        log.info(f"Scraping page {page}")
+        data.extend(data_from_rows(trs))
+        page += 1
+        trs = all_trs_from_page(f'{BASE_URL}{category}/{page}')
+    return data
+
+
+def get_all_data():
+    data: list[row_type] = []
+    for category in CATEGORIES:
+        data.extend(all_data_from_category(category))
+    return data
 
 
 def main() -> None:
-    soup = soup_from_page('https://www.marketwatch.com/tools/markets/stocks/a-z/0-9')
-    trs = all_trs_from_soup(soup)
-    data = data_from_rows(trs)
+    data = get_all_data()
     for tup in data:
         print(tup)
+    print(len(data), 'datasets')
 
 
 if __name__ == '__main__':
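Beyond printing, the new get_all_data() entry point lends itself to persisting the rows; a usage sketch (the module name and CSV column names are assumptions, mirroring the five fields built in get_single_row_data):

    import csv

    from scraper import get_all_data   # 'scraper' is a placeholder for whatever the module above is named


    def save_to_csv(path: str = 'stocks.csv') -> None:
        data = get_all_data()   # list of (company_name, stock_symbol, country, exchange, sector) tuples
        with open(path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['company_name', 'stock_symbol', 'country', 'exchange', 'sector'])
            writer.writerows(data)


    if __name__ == '__main__':
        save_to_csv()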