getting data from entire page
This commit is contained in:
parent
29b37c756d
commit
7588dc1600
@ -2,29 +2,54 @@ import re
|
|||||||
|
|
||||||
from requests import get
|
from requests import get
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag, ResultSet
|
||||||
|
|
||||||
|
|
||||||
row_type = tuple[str, str, str, str, str]
|
# One scraped table row, in the order returned by the row parser:
# (company name, stock symbol, country, exchange, sector).
row_type = tuple[str, str, str, str, str]
|
||||||
|
|
||||||
|
|
||||||
def get_row_data(table_row: Tag) -> row_type:
|
def data_from_rows(trs: ResultSet) -> list[row_type]:
    """Convert every table row in *trs* into a ``row_type`` tuple.

    Args:
        trs: The ``<tr>`` elements to parse.

    Returns:
        One tuple per row, in the same order as *trs*; an empty list when
        *trs* is empty.
    """
    return [get_single_row_data(row) for row in trs]
|
||||||
|
|
||||||
|
|
||||||
|
# Matches a symbol wrapped in parentheses, e.g. "(ABC)" or "(ABC.D)"; the dot
# is part of the character class so dotted symbols are captured whole.
# Compiled once here because the parser below runs once per table row.
_STOCK_SYMBOL_PATTERN = re.compile(r'\(([\w.]+)\)')


def get_single_row_data(table_row: Tag) -> row_type:
    """Extract the five fields of one table row.

    The first cell is read through its ``<a>`` element: the link's first
    child is the company name and the first child of the link's second child
    holds the parenthesised stock symbol. Cells 1-3 are read as country,
    exchange and sector.

    Args:
        table_row: A ``<tr>`` element with at least four ``<td>`` cells.

    Returns:
        ``(company_name, stock_symbol, country, exchange, sector)``.

    Raises:
        IndexError: If the row has fewer than four ``<td>`` cells.
        AttributeError: If the first cell has no ``<a>`` element, or the
            symbol text contains no parenthesised symbol.
    """
    tds = table_row.find_all('td')

    # NOTE(review): assumes the link's second child is an element wrapping
    # the "(SYMBOL)" text - confirm against the live page markup.
    link = tds[0].a
    company_name = str(link.contents[0]).strip()
    symbol_text = str(link.contents[1].contents[0]).strip()
    stock_symbol = _STOCK_SYMBOL_PATTERN.search(symbol_text).group(1)

    country = get_str_from_td(tds[1])
    exchange = get_str_from_td(tds[2])
    sector = get_str_from_td(tds[3])
    return company_name, stock_symbol, country, exchange, sector
|
||||||
|
|
||||||
|
|
||||||
|
def get_str_from_td(td: Tag) -> str:
    """Return the first child of *td* as a whitespace-trimmed string.

    A cell with no children at all yields the empty string.
    """
    children = td.contents
    if not children:
        return ''
    return str(children[0]).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def all_trs_from_soup(soup: BeautifulSoup) -> ResultSet:
    """Collect every ``<tr>`` under the table body of the ``marketsindex`` div.

    The lookup goes: first ``<div id="marketsindex">``, then its first
    ``<table>``, then that table's first ``<tbody>``.
    """
    container = soup.find('div', {'id': 'marketsindex'})
    table_body = container.table.tbody
    return table_body.find_all('tr')
|
||||||
|
|
||||||
|
|
||||||
|
def soup_from_page(url: str, timeout: float = 10.0) -> BeautifulSoup:
    """Download *url* and parse the response body with ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed document.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a 4xx/5xx response (raised by ``raise_for_status``).
    """
    response = get(url, timeout=timeout)
    # Fail here on an HTTP error instead of parsing an error page and
    # crashing later when the expected table is missing.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
|
||||||
|
|
||||||
|
|
||||||
|
# Listing page scraped when no other URL is supplied.
DEFAULT_URL = 'https://www.marketwatch.com/tools/markets/stocks/a-z/0-9'


def main(url: str = DEFAULT_URL) -> None:
    """Fetch a listing page, parse every table row and print each tuple.

    Args:
        url: Page to scrape; defaults to ``DEFAULT_URL`` so existing
            zero-argument calls behave as before.
    """
    soup = soup_from_page(url)
    trs = all_trs_from_soup(soup)
    for row in data_from_rows(trs):
        print(row)


if __name__ == '__main__':
    main()
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
|
||||||
trs = soup.find('div', {'id': 'marketsindex'}).table.tbody.find_all('tr')
|
|
||||||
print(get_row_data(trs[0]))
|
|
||||||
print(get_row_data(trs[1]))
|
|
||||||
print(get_row_data(trs[149]))
|
|
||||||
data: list[row_type] = []
|
|
||||||
|
Loading…
Reference in New Issue
Block a user