From 1dd2975a3c38abd732d51832ca8c161b84a92394 Mon Sep 17 00:00:00 2001 From: Daniil Fajnberg Date: Tue, 9 Nov 2021 17:03:20 +0100 Subject: [PATCH] first working draft for scraping all stock symbols (and supplementary data) --- src/stock-symbol-scraper/main.py | 66 +++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/src/stock-symbol-scraper/main.py b/src/stock-symbol-scraper/main.py index d845ad5..cd1d363 100644 --- a/src/stock-symbol-scraper/main.py +++ b/src/stock-symbol-scraper/main.py @@ -1,12 +1,33 @@ +import logging import re +from datetime import datetime +from string import ascii_uppercase from requests import get from bs4 import BeautifulSoup from bs4.element import Tag, ResultSet +log = logging.getLogger(__name__) +log.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +ch.setLevel(logging.DEBUG) +log.addHandler(ch) + + row_type = tuple[str, str, str, str, str] +DOMAIN = 'www.marketwatch.com' +BASE_URL = f'https://{DOMAIN}/tools/markets/stocks/a-z/' +DIGIT_CATEGORY = '0-9' +OTHER_CATEGORY = 'Other' +CATEGORIES = [DIGIT_CATEGORY] + list(ascii_uppercase) + [OTHER_CATEGORY] +STOCK_SYMBOL_PATTERN = re.compile(r'\(([\w.&]+)\)') + + +class UnexpectedMarkupError(Exception): + pass + def data_from_rows(trs: ResultSet) -> list[row_type]: data: list[row_type] = [] @@ -19,8 +40,11 @@ def get_single_row_data(table_row: Tag) -> row_type: tds = table_row.find_all('td') company_name = str(tds[0].a.contents[0]).strip() stock_symbol = str(tds[0].a.contents[1].contents[0]).strip() - stock_symbol_pattern = re.compile(r'\(([\w.]+)\)') - stock_symbol = re.search(stock_symbol_pattern, stock_symbol).group(1) + m = re.search(STOCK_SYMBOL_PATTERN, stock_symbol) + if m is None: + log.error(f"{stock_symbol} did not match the stock symbol pattern") + else: + stock_symbol = m.group(1) country = get_str_from_td(tds[1]) exchange = get_str_from_td(tds[2]) sector = get_str_from_td(tds[3]) @@ -35,20 +59,44 @@ def get_str_from_td(td: Tag) -> str: return str(content).strip() -def all_trs_from_soup(soup: BeautifulSoup) -> ResultSet: - return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr') +def all_trs_from_page(url: str) -> ResultSet: + response = get(url) + soup = BeautifulSoup(response.text, 'html.parser') + try: + return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr') + except AttributeError: + log.error("Unexpected HTML markup!") + file_name = f'unexpected_response_at_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.html' + with open(file_name, 'w') as f: + f.write(response.text) + raise UnexpectedMarkupError -def soup_from_page(url: str) -> BeautifulSoup: - return BeautifulSoup(get(url).text, 'html.parser') +def all_data_from_category(category: str) -> list[row_type]: + log.info(f"Getting companies starting with '{category}'") + data: list[row_type] = [] + page = 1 + trs = all_trs_from_page(f'{BASE_URL}{category}') + while len(trs) > 0: + log.info(f"Scraping page {page}") + data.extend(data_from_rows(trs)) + page += 1 + trs = all_trs_from_page(f'{BASE_URL}{category}/{page}') + return data + + +def get_all_data(): + data: list[row_type] = [] + for category in CATEGORIES: + data.extend(all_data_from_category(category)) + return data def main() -> None: - soup = soup_from_page('https://www.marketwatch.com/tools/markets/stocks/a-z/0-9') - trs = all_trs_from_soup(soup) - data = data_from_rows(trs) + data = get_all_data() for tup in data: print(tup) + print(len(data), 'datasets') if __name__ == '__main__':