From 7588dc160016c8d9b43f22034f2bd85ae6cf2e27 Mon Sep 17 00:00:00 2001
From: Maximilian Fajnberg
Date: Tue, 9 Nov 2021 15:33:03 +0100
Subject: [PATCH] getting data from entire page

---
 src/stock-symbol-scraper/main.py | 51 ++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/stock-symbol-scraper/main.py b/src/stock-symbol-scraper/main.py
index 7a30ab6..d845ad5 100644
--- a/src/stock-symbol-scraper/main.py
+++ b/src/stock-symbol-scraper/main.py
@@ -2,29 +2,54 @@ import re
 
 from requests import get
 from bs4 import BeautifulSoup
-from bs4.element import Tag
+from bs4.element import Tag, ResultSet
 
 
 row_type = tuple[str, str, str, str, str]
 
 
-def get_row_data(table_row: Tag) -> row_type:
+def data_from_rows(trs: ResultSet) -> list[row_type]:
+    data: list[row_type] = []
+    for row in trs:
+        data.append(get_single_row_data(row))
+    return data
+
+
+def get_single_row_data(table_row: Tag) -> row_type:
     tds = table_row.find_all('td')
     company_name = str(tds[0].a.contents[0]).strip()
     stock_symbol = str(tds[0].a.contents[1].contents[0]).strip()
-    stock_symbol_pattern = re.compile(r'\((\w+)\)')
+    stock_symbol_pattern = re.compile(r'\(([\w.]+)\)')
     stock_symbol = re.search(stock_symbol_pattern, stock_symbol).group(1)
-    country = str(tds[1].contents[0])
-    exchange = str(tds[2].contents[0])
-    sector = str(tds[3].contents[0])
+    country = get_str_from_td(tds[1])
+    exchange = get_str_from_td(tds[2])
+    sector = get_str_from_td(tds[3])
     return company_name, stock_symbol, country, exchange, sector
 
 
+def get_str_from_td(td: Tag) -> str:
+    try:
+        content = td.contents[0]
+    except IndexError:
+        return ''
+    return str(content).strip()
+
+
+def all_trs_from_soup(soup: BeautifulSoup) -> ResultSet:
+    return soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
+
+
+def soup_from_page(url: str) -> BeautifulSoup:
+    return BeautifulSoup(get(url).text, 'html.parser')
+
+
+def main() -> None:
+    soup = soup_from_page('https://www.marketwatch.com/tools/markets/stocks/a-z/0-9')
+    trs = all_trs_from_soup(soup)
+    data = data_from_rows(trs)
+    for tup in data:
+        print(tup)
+
+
 if __name__ == '__main__':
-    response = get('https://www.marketwatch.com/tools/markets/stocks/a-z/0-9')
-    soup = BeautifulSoup(response.text, 'html.parser')
-    trs = soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
-    print(get_row_data(trs[0]))
-    print(get_row_data(trs[1]))
-    print(get_row_data(trs[149]))
-    data: list[row_type] = []
+    main()